DQN

DQN is the foundation of deep reinforcement learning: a fully connected network is used to approximate the action-value function Q(s, a), which is the value of taking action a in state s (the expected discounted return from that point on).
Let's go straight to the definition:

import collections
import random

import numpy as np
import torch
import torch.nn as nn


class QNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # maps a state vector to one Q(s, a) estimate per action
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.ReLU()  # note: this final ReLU constrains the predicted Q-values to be non-negative
        )

    def forward(self, x):
        return self.net(x)

During training there are no pre-existing samples, so the agent interacts with the environment and each transition (state, action, reward, next state, done flag) is pushed into a replay buffer; once the buffer holds enough samples, random mini-batches are drawn from it to train the Q network. Note that the Q network's input has shape (B, state_dim), one row per state, and its output has shape (B, action_dim), one column per action holding the corresponding Q(s, a).
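As a quick sanity check of those shapes (the sizes below are placeholders of mine; CartPole, for instance, has a 4-dimensional state and 2 actions):

state_dim, hidden_dim, action_dim = 4, 128, 2        # placeholder sizes
q_net = QNet(state_dim=state_dim, hidden_dim=hidden_dim, action_dim=action_dim)
batch = torch.rand(32, state_dim)                    # (B, state_dim)
print(q_net(batch).shape)                            # torch.Size([32, 2]), i.e. (B, action_dim)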
Next comes the replay buffer. Because the textbook's reference code seems to have a few problems with the package versions I downloaded, I use a StateNode class to keep the variable types under control, so that everything (well, almost everything) is passed around as a StateNode:

class StateNode:
    def __init__(self, *, state: torch.Tensor, action: torch.Tensor, reward: float, next_state: torch.Tensor, done: bool):
        # normalize state / next_state to tensors: env.reset() may return an (obs, info) tuple,
        # and observations usually come back as numpy arrays
        self.state = state
        if isinstance(state, tuple):
            self.state = torch.tensor(state[0])
        if isinstance(state, np.ndarray):
            self.state = torch.tensor(state)
        self.action = action
        self.reward = reward
        self.done = done
        self.next_state = next_state
        if isinstance(next_state, tuple):
            self.next_state = torch.tensor(next_state[0])
        if isinstance(next_state, np.ndarray):
            self.next_state = torch.tensor(next_state)


class ReplayBuffer:
    def __init__(self, capacity: int, state_dim: int):
        self.buffer = collections.deque(maxlen=capacity)
        self.state_dim = state_dim

    def add(self, item: StateNode):
        if not isinstance(item.action, torch.Tensor):
            raise ValueError(f"action must be Tensor not {type(item.action)}")
        self.buffer.append(item)

    def sample(self, batch_size):
        # draw a random mini-batch and stack the fields of each StateNode into batched tensors
        transition = random.sample(self.buffer, batch_size)
        states = transition[0].state
        actions = transition[0].action
        if actions.ndim == 0:
            actions = actions.unsqueeze(0)
        rewards = [transition[0].reward]
        next_states = transition[0].next_state
        dones = [transition[0].done]
        for node in transition[1:]:
            states = torch.cat((states, node.state), dim=0)
            if node.action.ndim == 0:
                node.action = node.action.unsqueeze(0)
            actions = torch.cat((actions, node.action), dim=0)
            next_states = torch.cat((next_states, node.next_state), dim=0)
            rewards.append(node.reward)
            dones.append(node.done)
        return (states.reshape((batch_size, self.state_dim)),
                actions.reshape((-1, 1)),
                torch.tensor(rewards).reshape((-1, 1)),
                next_states.reshape((batch_size, self.state_dim)),
                torch.tensor(dones))

    def __len__(self) -> int:
        return len(self.buffer)
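A minimal usage sketch of the buffer (the state values and sizes here are arbitrary placeholders of mine, not from the book):

buffer = ReplayBuffer(capacity=10000, state_dim=4)
node = StateNode(state=np.array([0.1, 0.0, -0.05, 0.0], dtype=np.float32),
                 action=torch.tensor(1),
                 reward=1.0,
                 next_state=np.array([0.1, 0.2, -0.06, -0.3], dtype=np.float32),
                 done=False)
buffer.add(node)
states, actions, rewards, next_states, dones = buffer.sample(batch_size=1)
print(states.shape, actions.shape, rewards.shape)  # shapes: (1, 4), (1, 1), (1, 1)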

Next is the DQN class itself. As the class the agent is instantiated from, it needs to be able to take actions and to update its policy. DQN keeps two networks: policy_net and target_net. policy_net picks the current action and produces the current action value; target_net supplies the maximum action value for the next state. The TD error between the two is used to update policy_net so that its estimates move towards the bootstrapped target, and every fixed number of steps target_net is overwritten with policy_net's weights so that the target values stay reasonably up to date.
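Written in standard notation (my own summary, not quoted from the book), the target and loss that the update below computes are:

y = r + \gamma \,(1 - d)\,\max_{a'} Q_{\text{target}}(s', a'), \qquad
\mathcal{L} = \bigl(Q_{\text{policy}}(s, a) - y\bigr)^2

where d is the done flag and γ the discount factor.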

class DQN:
    def __init__(self, state_dim: int, hidden_dim: int, action_dim: int, gamma: float, epsilon: float, lr: float,
                 targetUpdateFrequent: int, dqnType: str):
        if dqnType not in ["DQN", "DoubleDQN", "DuelingDQN"]:
            raise ValueError(f"illegal dqnType = {dqnType}")
        self.dqnType = dqnType
        if self.dqnType in ("DQN", "DoubleDQN"):
            self.policy_net = QNet(state_dim=state_dim,
                                   hidden_dim=hidden_dim,
                                   action_dim=action_dim)
            self.target_net = QNet(state_dim=state_dim,
                                   hidden_dim=hidden_dim,
                                   action_dim=action_dim)
        elif self.dqnType == 'DuelingDQN':
            self.policy_net = VANet(state_dim=state_dim,
                                    hidden_dim=hidden_dim,
                                    action_dim=action_dim)
            self.target_net = VANet(state_dim=state_dim,
                                    hidden_dim=hidden_dim,
                                    action_dim=action_dim)
        self.gamma = gamma
        self.epsilon = epsilon
        self.action_dim = action_dim
        self.optim = torch.optim.Adam(self.policy_net.parameters(), lr=lr)
        self.crit = nn.MSELoss()
        self.count = 0
        self.targetUpdateFrequent = targetUpdateFrequent

    def take_action(self, state: torch.Tensor):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily w.r.t. policy_net
        if not isinstance(state, torch.Tensor):
            raise ValueError(f"state must be tensor not {type(state)}")
        p = np.random.random()
        if p < self.epsilon:
            action = torch.tensor(np.random.randint(self.action_dim))
        else:
            Qsa = self.policy_net(state)
            if Qsa.dim() == 1:
                Qsa = Qsa.unsqueeze(0)
            _, action = torch.max(Qsa, dim=1)
        return action

    def max_q_value(self, state: torch.Tensor):
        return self.policy_net(state).max().item()

    def update(self, stateNode: StateNode):
        self.optim.zero_grad()
        # Q(s, a) of the actions that were actually taken
        Qsa = self.policy_net(stateNode.state)
        q_value = Qsa.gather(1, stateNode.action)
        if self.dqnType == "DoubleDQN":
            # Double DQN: policy_net selects the next action, target_net evaluates it
            max_action = self.policy_net(stateNode.next_state).max(1)[1].view(-1, 1)
            max_q_value = self.target_net(stateNode.next_state).gather(1, max_action)
        else:
            # DQN / DuelingDQN: target_net both selects and evaluates the next action
            max_q_value = self.target_net(stateNode.next_state).max(1)[0].view(-1, 1)
        # TD target: r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed at terminal states
        target_q_value = stateNode.reward.unsqueeze(1) + self.gamma * max_q_value * torch.logical_not(
            stateNode.done).unsqueeze(1)
        target_q_value = target_q_value.to(torch.double)
        q_value = q_value.to(torch.double)
        l = self.crit(target_q_value, q_value)
        l.backward()
        self.optim.step()
        # periodically sync target_net with policy_net
        if self.count % self.targetUpdateFrequent == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        self.count += 1
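To tie the pieces together, here is a rough training-loop sketch. It assumes a Gymnasium-style CartPole-v1 environment (reset() returning (obs, info) and step() returning five values); the hyperparameters, the warm-up threshold minimal_size and the batch size are placeholder choices of mine, not the textbook's.

import gymnasium as gym  # assumption: Gymnasium API; adjust the reset/step unpacking for classic gym

env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]       # 4
action_dim = env.action_space.n                  # 2
agent = DQN(state_dim=state_dim, hidden_dim=128, action_dim=action_dim,
            gamma=0.98, epsilon=0.01, lr=2e-3,
            targetUpdateFrequent=10, dqnType="DQN")
buffer = ReplayBuffer(capacity=10000, state_dim=state_dim)
minimal_size, batch_size = 500, 64               # placeholder warm-up threshold and batch size

for episode in range(500):
    state, _ = env.reset()
    done = False
    while not done:
        action = agent.take_action(torch.tensor(state))
        next_state, reward, terminated, truncated, _ = env.step(int(action.item()))
        done = terminated or truncated
        buffer.add(StateNode(state=state, action=action, reward=float(reward),
                             next_state=next_state, done=done))
        state = next_state
        if len(buffer) >= minimal_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)
            # update() unsqueezes reward and done itself, so pass the rewards as a 1-D tensor
            agent.update(StateNode(state=states, action=actions, reward=rewards.squeeze(1),
                                   next_state=next_states, done=dones))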

Here is a brief introduction to DoubleDQN and DuelingDQN.

DoubleDQN

Vanilla DQN suffers from overestimated Q-values (I have not worked through the proof, but intuitively, taking a maximum over noisy estimates is biased upwards). The overestimation comes from the target being taken directly as target_net's maximum action value, i.e. Q_target(s', a_max) with a_max chosen by target_net itself. To address this, DoubleDQN lets a_max be chosen by policy_net and then has target_net evaluate that action's value, which is exactly what this code does:

max_action = self.policy_net(stateNode.next_state).max(1)[1].view(-1, 1)
max_q_value = self.target_net(stateNode.next_state).gather(1, max_action)

In other words, action selection and action evaluation are split between the two networks, which is what dampens the overestimation.
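Written out in standard notation (my summary), the two targets are:

y^{\text{DQN}}       = r + \gamma \max_{a'} Q_{\text{target}}(s', a')
y^{\text{DoubleDQN}} = r + \gamma \, Q_{\text{target}}\bigl(s', \arg\max_{a'} Q_{\text{policy}}(s', a')\bigr)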

DuelingDQN

DuelingDQN changes the structure of the Q network: the last layer now produces two outputs, an advantage head A and a value head V, from which the Q value is computed.
Concretely, V(s) is the state value and A(s, a) is the advantage of each action, defined as A(s, a) = Q(s, a) - V(s).
The action value in the dueling architecture is then the state value V plus the advantage A, with the constraint that the advantages of all actions in a state sum (equivalently, average) to zero.
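The aggregation that VANet implements below is the usual mean-centered form:

Q(s, a) = V(s) + A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a')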

class VANet(torch.nn.Module):
    ''' A-network and V-network with a single shared hidden layer '''

    def __init__(self, state_dim, hidden_dim, action_dim):
        super(VANet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)  # shared feature layer
        self.fc_A = torch.nn.Linear(hidden_dim, action_dim)
        self.relu = nn.ReLU()
        self.fc_V = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        A = self.fc_A(self.relu(self.fc1(x)))
        V = self.fc_V(self.relu(self.fc1(x)))
        Q = V + A - A.mean(1).view(-1, 1)  # Q is computed from V and the mean-centered A
        return Q

As you can see, if we add a constant k to V and subtract the same k from A, the resulting Q is unchanged. Since V and A come out of different heads, the decomposition is not uniquely determined; to make the two heads consistent, A is centered by subtracting its mean, which pins the decomposition down while Q is being optimized.
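A tiny numerical illustration of that identifiability issue (the numbers are arbitrary):

V = torch.tensor([[2.0]])                 # state value
A = torch.tensor([[1.0, -1.0, 0.0]])      # advantages of 3 actions
k = 5.0
Q1 = V + A                                # naive aggregation
Q2 = (V + k) + (A - k)                    # shift V up and A down by the same k
print(torch.allclose(Q1, Q2))             # True: V and A cannot be recovered from Q alone
# mean-centering A removes the ambiguity: the centered advantages do not depend on k
print(A - A.mean(1, keepdim=True))
print((A - k) - (A - k).mean(1, keepdim=True))  # identical to the line above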