bigram language model
A bigram language model looks only at the previous token to predict the probability of the next one.
I don't feel like writing this part up in detail since the model is so primitive, so I'll just leave a minimal sketch below.
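This is my own minimal sketch of the usual bigram formulation, not code from the video; the class and variable names are my own:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Each token directly looks up a row of logits for the next token."""
    def __init__(self, vocab_size):
        super().__init__()
        # (vocab_size, vocab_size) table: row i holds unnormalized next-token scores after token i
        self.logits_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        logits = self.logits_table(idx)          # (batch, time, vocab_size)
        if target is None:
            return logits, None
        b, t, v = logits.shape
        loss = F.cross_entropy(logits.view(b * t, v), target.view(b * t))
        return logits, loss
```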
small GPT
This code builds a simplified GPT architecture from scratch. GPT is a decoder-only architecture, so it's essentially a matter of stacking decoder blocks in sequence.
The video I followed builds the overall framework first and then refines it step by step; I think that coding style is worth picking up.
First is the GPT model itself:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# `device` is assumed to be defined at module level in the full script, e.g.:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size, block_size, n_embed, n_layer, n_head) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embed = n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.embedding_table = nn.Embedding(vocab_size, n_embed)      # token embeddings
        self.position_embedding = nn.Embedding(block_size, n_embed)   # learnable position embeddings
        self.blocks = nn.Sequential(*[Decoder(n_embed=n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)                             # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)                 # project back to vocabulary logits
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, _input, target=None):
        batch, time = _input.shape
        token_emb = self.embedding_table(_input)                               # (batch, time, n_embed)
        posi_emb = self.position_embedding(torch.arange(time, device=device))  # (time, n_embed)
        x = token_emb + posi_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)                                               # (batch, time, vocab_size)

        if target is None:
            loss = None
        else:
            batch, time, char = logits.shape
            logits = logits.view(batch * time, char)
            target = target.view(batch * time)
            loss = F.cross_entropy(logits, target)
        return logits, loss

    def generate(self, _input, max_new_tokens):
        for _ in range(max_new_tokens):
            # crop the context to the last block_size tokens, otherwise the
            # position embedding gets indexed out of range once the sequence grows
            _input_cond = _input[:, -self.block_size:]
            logits, _ = self.forward(_input_cond)
            logits = logits[:, -1, :]                    # only the last time step matters
            probs = F.softmax(logits, dim=-1)
            _input_next = torch.multinomial(probs, num_samples=1)
            _input = torch.cat((_input, _input_next), dim=1)
        return _input
```
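A quick usage sketch of how the pieces fit together; the hyperparameter values and the device line below are placeholders I picked for illustration, not the settings from the video:

```python
# placeholder hyperparameters, not the video's actual settings
vocab_size, block_size, n_embed, n_layer, n_head = 65, 256, 384, 6, 6
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GPTLanguageModel(vocab_size, block_size, n_embed, n_layer, n_head).to(device)

# dummy batch of integer token ids with shape (batch, time)
xb = torch.randint(0, vocab_size, (4, block_size), device=device)
yb = torch.randint(0, vocab_size, (4, block_size), device=device)
logits, loss = model(xb, yb)

# start generation from a single token of context
context = torch.zeros((1, 1), dtype=torch.long, device=device)
out = model.generate(context, max_new_tokens=100)
```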
The generate method still has a small issue that I'll debug again over the weekend, but training works now. The plan is to download the dataset onto the 4090 machine and run a training run there to see how it goes.
The original Transformer uses a fixed function for its positional encoding, but GPT encodes positions with a learnable embedding instead.
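For comparison, the fixed sinusoidal encoding from the original Transformer can be written as a small helper like this (my own sketch, not part of the code above):

```python
import math
import torch

def sinusoidal_position_encoding(block_size, n_embed):
    """Fixed, non-learnable encoding: PE[pos, 2i] = sin(pos / 10000^(2i/d)), PE[pos, 2i+1] = cos(...)."""
    position = torch.arange(block_size).unsqueeze(1)                     # (block_size, 1)
    div_term = torch.exp(torch.arange(0, n_embed, 2, dtype=torch.float) * (-math.log(10000.0) / n_embed))
    pe = torch.zeros(block_size, n_embed)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe                                                            # added to token embeddings, never trained
```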
Next is the Decoder block:
```python
class Decoder(nn.Module):
    def __init__(self, n_embed, n_head) -> None:
        super().__init__()
        head_size = n_embed // n_head
        self.multihead = MultiheadAttention(n_embed, n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x1 = self.multihead(x)
        x = self.ln1(x + x1)   # residual connection + layer norm
        x2 = self.ffwd(x)
        x = self.ln2(x + x2)   # residual connection + layer norm
        return x
```
I won't paste the feed-forward network since it's pretty simple.
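Since it's skipped above, here is a minimal sketch of what a FeedForward block usually looks like in this style of code; the 4x expansion, the ReLU, and the dropout value are my assumptions:

```python
class FeedForward(nn.Module):
    """Position-wise MLP applied after attention: expand, non-linearity, project back."""
    def __init__(self, n_embed, dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),   # expand to 4x the embedding width
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),   # project back down to n_embed
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
```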
Then comes multi-head attention:
```python
class Head(nn.Module):
    """One head of causal self-attention. Note: n_embed and block_size are read from module-level globals."""
    def __init__(self, head_size, dropout=0.2) -> None:
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # lower-triangular mask so each position can only attend to itself and the past
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        batch, time, channel = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        att = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5                 # scaled dot-product scores
        att = att.masked_fill(self.tril[:time, :time] == 0, float('-inf'))  # causal mask
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        return att @ v


class MultiheadAttention(nn.Module):
    def __init__(self, n_embed, n_head, head_size, dropout=0.2) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])
        self.proj = nn.Linear(head_size * n_head, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)   # concatenate head outputs along the channel dim
        x = self.dropout(self.proj(x))
        return x
```
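A quick shape check; since Head reads n_embed and block_size as module-level globals, they have to exist before the layers are built (the values below are placeholders):

```python
# Head reads these as globals, so define them first (placeholder values)
n_embed, block_size = 384, 256

mha = MultiheadAttention(n_embed=n_embed, n_head=6, head_size=n_embed // 6)
x = torch.randn(4, block_size, n_embed)   # (batch, time, channel)
print(mha(x).shape)                       # torch.Size([4, 256, 384])
```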
Broken down like this, it's actually fairly simple.
Next up is training.
Abandoned halfway, folks.
There's just no way: the corpus is tens of GB and my free storage space is nowhere near enough to hold it.
On top of that, the video never actually demonstrates the training run.
Off to learn diffusion instead 🥲
Diffusion got abandoned too!
I'm going to learn reinforcement learning! I want to build a Gomoku agent!