DQN

DQN is the foundation of deep reinforcement learning: a fully connected network is used to approximate the action-value function Q(s, a), which is the value of taking action a in state s (the expected discounted return from that point on).
Let's go straight to the definition:

import collections
import random

import numpy as np
import torch
import torch.nn as nn


class QNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # maps a state vector to one Q(s, a) estimate per action
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.ReLU()  # note: this final ReLU constrains the predicted Q-values to be non-negative
        )

    def forward(self, x):
        return self.net(x)

During training there are no pre-existing samples, so the agent interacts with the environment and each transition (state, action, reward, next state, done flag) is pushed into a replay buffer; once the buffer holds enough samples, random mini-batches are drawn from it to train the Q network. Note that the Q network's input has shape (B, state_dim), one row per state, and its output has shape (B, action_dim), one column per action holding the corresponding Q(s, a).
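As a quick sanity check of those shapes (the sizes below are placeholders of mine; CartPole, for instance, has a 4-dimensional state and 2 actions):

state_dim, hidden_dim, action_dim = 4, 128, 2        # placeholder sizes
q_net = QNet(state_dim=state_dim, hidden_dim=hidden_dim, action_dim=action_dim)
batch = torch.rand(32, state_dim)                    # (B, state_dim)
print(q_net(batch).shape)                            # torch.Size([32, 2]), i.e. (B, action_dim)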
Next comes the replay buffer. Because the textbook's reference code seems to have a few problems with the package versions I downloaded, I use a StateNode class to keep the variable types under control, so that everything (well, almost everything) is passed around as a StateNode:

class StateNode:
    def __init__(self, *, state: torch.Tensor, action: torch.Tensor, reward: float, next_state: torch.Tensor, done: bool):
        # normalize state / next_state to tensors: env.reset() may return an (obs, info) tuple,
        # and observations usually come back as numpy arrays
        self.state = state
        if isinstance(state, tuple):
            self.state = torch.tensor(state[0])
        if isinstance(state, np.ndarray):
            self.state = torch.tensor(state)
        self.action = action
        self.reward = reward
        self.done = done
        self.next_state = next_state
        if isinstance(next_state, tuple):
            self.next_state = torch.tensor(next_state[0])
        if isinstance(next_state, np.ndarray):
            self.next_state = torch.tensor(next_state)


class ReplayBuffer:
    def __init__(self, capacity: int, state_dim: int):
        self.buffer = collections.deque(maxlen=capacity)
        self.state_dim = state_dim

    def add(self, item: StateNode):
        if not isinstance(item.action, torch.Tensor):
            raise ValueError(f"action must be Tensor not {type(item.action)}")
        self.buffer.append(item)

    def sample(self, batch_size):
        # draw a random mini-batch and stack the fields of each StateNode into batched tensors
        transition = random.sample(self.buffer, batch_size)
        states = transition[0].state
        actions = transition[0].action
        if actions.ndim == 0:
            actions = actions.unsqueeze(0)
        rewards = [transition[0].reward]
        next_states = transition[0].next_state
        dones = [transition[0].done]
        for node in transition[1:]:
            states = torch.cat((states, node.state), dim=0)
            if node.action.ndim == 0:
                node.action = node.action.unsqueeze(0)
            actions = torch.cat((actions, node.action), dim=0)
            next_states = torch.cat((next_states, node.next_state), dim=0)
            rewards.append(node.reward)
            dones.append(node.done)
        return (states.reshape((batch_size, self.state_dim)),
                actions.reshape((-1, 1)),
                torch.tensor(rewards).reshape((-1, 1)),
                next_states.reshape((batch_size, self.state_dim)),
                torch.tensor(dones))

    def __len__(self) -> int:
        return len(self.buffer)
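A minimal usage sketch of the buffer (the state values and sizes here are arbitrary placeholders of mine, not from the book):

buffer = ReplayBuffer(capacity=10000, state_dim=4)
node = StateNode(state=np.array([0.1, 0.0, -0.05, 0.0], dtype=np.float32),
                 action=torch.tensor(1),
                 reward=1.0,
                 next_state=np.array([0.1, 0.2, -0.06, -0.3], dtype=np.float32),
                 done=False)
buffer.add(node)
states, actions, rewards, next_states, dones = buffer.sample(batch_size=1)
print(states.shape, actions.shape, rewards.shape)  # shapes: (1, 4), (1, 1), (1, 1)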

Next is the DQN class itself. As the class the agent is instantiated from, it needs to be able to take actions and to update its policy. DQN keeps two networks: policy_net and target_net. policy_net picks the current action and produces the current action value; target_net supplies the maximum action value for the next state. The TD error between the two is used to update policy_net so that its estimates move towards the bootstrapped target, and every fixed number of steps target_net is overwritten with policy_net's weights so that the target values stay reasonably up to date.
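Written in standard notation (my own summary, not quoted from the book), the target and loss that the update below computes are:

y = r + \gamma \,(1 - d)\,\max_{a'} Q_{\text{target}}(s', a'), \qquad
\mathcal{L} = \bigl(Q_{\text{policy}}(s, a) - y\bigr)^2

where d is the done flag and γ the discount factor.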

class DQN:
    def __init__(self, state_dim: int, hidden_dim: int, action_dim: int, gamma: float, epsilon: float, lr: float,
                 targetUpdateFrequent: int, dqnType: str):
        if dqnType not in ["DQN", "DoubleDQN", "DuelingDQN"]:
            raise ValueError(f"illegal dqnType = {dqnType}")
        self.dqnType = dqnType
        if self.dqnType in ("DQN", "DoubleDQN"):
            self.policy_net = QNet(state_dim=state_dim,
                                   hidden_dim=hidden_dim,
                                   action_dim=action_dim)
            self.target_net = QNet(state_dim=state_dim,
                                   hidden_dim=hidden_dim,
                                   action_dim=action_dim)
        elif self.dqnType == 'DuelingDQN':
            self.policy_net = VANet(state_dim=state_dim,
                                    hidden_dim=hidden_dim,
                                    action_dim=action_dim)
            self.target_net = VANet(state_dim=state_dim,
                                    hidden_dim=hidden_dim,
                                    action_dim=action_dim)
        self.gamma = gamma
        self.epsilon = epsilon
        self.action_dim = action_dim
        self.optim = torch.optim.Adam(self.policy_net.parameters(), lr=lr)
        self.crit = nn.MSELoss()
        self.count = 0
        self.targetUpdateFrequent = targetUpdateFrequent

    def take_action(self, state: torch.Tensor):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily w.r.t. policy_net
        if not isinstance(state, torch.Tensor):
            raise ValueError(f"state must be tensor not {type(state)}")
        p = np.random.random()
        if p < self.epsilon:
            action = torch.tensor(np.random.randint(self.action_dim))
        else:
            Qsa = self.policy_net(state)
            if Qsa.dim() == 1:
                Qsa = Qsa.unsqueeze(0)
            _, action = torch.max(Qsa, dim=1)
        return action

    def max_q_value(self, state: torch.Tensor):
        return self.policy_net(state).max().item()

    def update(self, stateNode: StateNode):
        self.optim.zero_grad()
        # Q(s, a) of the actions that were actually taken
        Qsa = self.policy_net(stateNode.state)
        q_value = Qsa.gather(1, stateNode.action)
        if self.dqnType == "DoubleDQN":
            # Double DQN: policy_net selects the next action, target_net evaluates it
            max_action = self.policy_net(stateNode.next_state).max(1)[1].view(-1, 1)
            max_q_value = self.target_net(stateNode.next_state).gather(1, max_action)
        else:
            # DQN / DuelingDQN: target_net both selects and evaluates the next action
            max_q_value = self.target_net(stateNode.next_state).max(1)[0].view(-1, 1)
        # TD target: r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed at terminal states
        target_q_value = stateNode.reward.unsqueeze(1) + self.gamma * max_q_value * torch.logical_not(
            stateNode.done).unsqueeze(1)
        target_q_value = target_q_value.to(torch.double)
        q_value = q_value.to(torch.double)
        l = self.crit(target_q_value, q_value)
        l.backward()
        self.optim.step()
        # periodically sync target_net with policy_net
        if self.count % self.targetUpdateFrequent == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        self.count += 1
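To tie the pieces together, here is a rough training-loop sketch. It assumes a Gymnasium-style CartPole-v1 environment (reset() returning (obs, info) and step() returning five values); the hyperparameters, the warm-up threshold minimal_size and the batch size are placeholder choices of mine, not the textbook's.

import gymnasium as gym  # assumption: Gymnasium API; adjust the reset/step unpacking for classic gym

env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]       # 4
action_dim = env.action_space.n                  # 2
agent = DQN(state_dim=state_dim, hidden_dim=128, action_dim=action_dim,
            gamma=0.98, epsilon=0.01, lr=2e-3,
            targetUpdateFrequent=10, dqnType="DQN")
buffer = ReplayBuffer(capacity=10000, state_dim=state_dim)
minimal_size, batch_size = 500, 64               # placeholder warm-up threshold and batch size

for episode in range(500):
    state, _ = env.reset()
    done = False
    while not done:
        action = agent.take_action(torch.tensor(state))
        next_state, reward, terminated, truncated, _ = env.step(int(action.item()))
        done = terminated or truncated
        buffer.add(StateNode(state=state, action=action, reward=float(reward),
                             next_state=next_state, done=done))
        state = next_state
        if len(buffer) >= minimal_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)
            # update() unsqueezes reward and done itself, so pass the rewards as a 1-D tensor
            agent.update(StateNode(state=states, action=actions, reward=rewards.squeeze(1),
                                   next_state=next_states, done=dones))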

Here is a brief introduction to DoubleDQN and DuelingDQN.

DoubleDQN

Vanilla DQN suffers from overestimated Q-values (I have not worked through the proof, but intuitively, taking a maximum over noisy estimates is biased upwards). The overestimation comes from the target being taken directly as target_net's maximum action value, i.e. Q_target(s', a_max) with a_max chosen by target_net itself. To address this, DoubleDQN lets a_max be chosen by policy_net and then has target_net evaluate that action's value, which is exactly what this code does:

max_action = self.policy_net(stateNode.next_state).max(1)[1].view(-1, 1)
max_q_value = self.target_net(stateNode.next_state).gather(1, max_action)

In other words, action selection and action evaluation are split between the two networks, which is what dampens the overestimation.
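Written out in standard notation (my summary), the two targets are:

y^{\text{DQN}}       = r + \gamma \max_{a'} Q_{\text{target}}(s', a')
y^{\text{DoubleDQN}} = r + \gamma \, Q_{\text{target}}\bigl(s', \arg\max_{a'} Q_{\text{policy}}(s', a')\bigr)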

DuelingDQN

DuelingDQN changes the structure of the Q network: the last layer now produces two outputs, an advantage head A and a value head V, from which the Q value is computed.
Concretely, V(s) is the state value and A(s, a) is the advantage of each action, defined as A(s, a) = Q(s, a) - V(s).
The action value in the dueling architecture is then the state value V plus the advantage A, with the constraint that the advantages of all actions in a state sum (equivalently, average) to zero.
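The aggregation that VANet implements below is the usual mean-centered form:

Q(s, a) = V(s) + A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a')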

class VANet(torch.nn.Module):
    ''' A-network and V-network with a single shared hidden layer '''

    def __init__(self, state_dim, hidden_dim, action_dim):
        super(VANet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)  # shared feature layer
        self.fc_A = torch.nn.Linear(hidden_dim, action_dim)
        self.relu = nn.ReLU()
        self.fc_V = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        A = self.fc_A(self.relu(self.fc1(x)))
        V = self.fc_V(self.relu(self.fc1(x)))
        Q = V + A - A.mean(1).view(-1, 1)  # Q is computed from V and the mean-centered A
        return Q

As you can see, if we add a constant k to V and subtract the same k from A, the resulting Q is unchanged. Since V and A come out of different heads, the decomposition is not uniquely determined; to make the two heads consistent, A is centered by subtracting its mean, which pins the decomposition down while Q is being optimized.
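A tiny numerical illustration of that identifiability issue (the numbers are arbitrary):

V = torch.tensor([[2.0]])                 # state value
A = torch.tensor([[1.0, -1.0, 0.0]])      # advantages of 3 actions
k = 5.0
Q1 = V + A                                # naive aggregation
Q2 = (V + k) + (A - k)                    # shift V up and A down by the same k
print(torch.allclose(Q1, Q2))             # True: V and A cannot be recovered from Q alone
# mean-centering A removes the ambiguity: the centered advantages do not depend on k
print(A - A.mean(1, keepdim=True))
print((A - k) - (A - k).mean(1, keepdim=True))  # identical to the line above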