bigram language model
A bigram language model looks only at the previous token to predict the probability of the next one.
I don't feel like writing this part up in detail since the model is so primitive, so I'll just leave a minimal sketch below.
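This is my own minimal sketch of the usual bigram formulation, not code from the video; the class and variable names are my own:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Each token directly looks up a row of logits for the next token."""
    def __init__(self, vocab_size):
        super().__init__()
        # (vocab_size, vocab_size) table: row i holds unnormalized next-token scores after token i
        self.logits_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        logits = self.logits_table(idx)          # (batch, time, vocab_size)
        if target is None:
            return logits, None
        b, t, v = logits.shape
        loss = F.cross_entropy(logits.view(b * t, v), target.view(b * t))
        return logits, loss
```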
small GPT
This code builds a simplified GPT architecture from scratch. GPT is a decoder-only architecture, so it's essentially a matter of stacking decoder blocks in sequence.
The video I followed builds the overall framework first and then refines it step by step; I think that coding style is worth picking up.
First is the GPT model itself:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# `device` is assumed to be defined at module level in the full script, e.g.:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size, block_size, n_embed, n_layer, n_head) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embed = n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.embedding_table = nn.Embedding(vocab_size, n_embed)      # token embeddings
        self.position_embedding = nn.Embedding(block_size, n_embed)   # learnable position embeddings
        self.blocks = nn.Sequential(*[Decoder(n_embed=n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)                             # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)                 # project back to vocabulary logits
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, _input, target=None):
        batch, time = _input.shape
        token_emb = self.embedding_table(_input)                               # (batch, time, n_embed)
        posi_emb = self.position_embedding(torch.arange(time, device=device))  # (time, n_embed)
        x = token_emb + posi_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)                                               # (batch, time, vocab_size)

        if target is None:
            loss = None
        else:
            batch, time, char = logits.shape
            logits = logits.view(batch * time, char)
            target = target.view(batch * time)
            loss = F.cross_entropy(logits, target)
        return logits, loss

    def generate(self, _input, max_new_tokens):
        for _ in range(max_new_tokens):
            # crop the context to the last block_size tokens, otherwise the
            # position embedding gets indexed out of range once the sequence grows
            _input_cond = _input[:, -self.block_size:]
            logits, _ = self.forward(_input_cond)
            logits = logits[:, -1, :]                    # only the last time step matters
            probs = F.softmax(logits, dim=-1)
            _input_next = torch.multinomial(probs, num_samples=1)
            _input = torch.cat((_input, _input_next), dim=1)
        return _input
```
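A quick usage sketch of how the pieces fit together; the hyperparameter values and the device line below are placeholders I picked for illustration, not the settings from the video:

```python
# placeholder hyperparameters, not the video's actual settings
vocab_size, block_size, n_embed, n_layer, n_head = 65, 256, 384, 6, 6
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GPTLanguageModel(vocab_size, block_size, n_embed, n_layer, n_head).to(device)

# dummy batch of integer token ids with shape (batch, time)
xb = torch.randint(0, vocab_size, (4, block_size), device=device)
yb = torch.randint(0, vocab_size, (4, block_size), device=device)
logits, loss = model(xb, yb)

# start generation from a single token of context
context = torch.zeros((1, 1), dtype=torch.long, device=device)
out = model.generate(context, max_new_tokens=100)
```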
The generate method still has a small issue that I'll debug again over the weekend, but training works now. The plan is to download the dataset onto the 4090 machine and run a training run there to see how it goes.
The original Transformer uses a fixed function for its positional encoding, but GPT encodes positions with a learnable embedding instead.
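For comparison, the fixed sinusoidal encoding from the original Transformer can be written as a small helper like this (my own sketch, not part of the code above):

```python
import math
import torch

def sinusoidal_position_encoding(block_size, n_embed):
    """Fixed, non-learnable encoding: PE[pos, 2i] = sin(pos / 10000^(2i/d)), PE[pos, 2i+1] = cos(...)."""
    position = torch.arange(block_size).unsqueeze(1)                     # (block_size, 1)
    div_term = torch.exp(torch.arange(0, n_embed, 2, dtype=torch.float) * (-math.log(10000.0) / n_embed))
    pe = torch.zeros(block_size, n_embed)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe                                                            # added to token embeddings, never trained
```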
Next is the Decoder block:
```python
class Decoder(nn.Module):
    def __init__(self, n_embed, n_head) -> None:
        super().__init__()
        head_size = n_embed // n_head
        self.multihead = MultiheadAttention(n_embed, n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x1 = self.multihead(x)
        x = self.ln1(x + x1)   # residual connection + layer norm
        x2 = self.ffwd(x)
        x = self.ln2(x + x2)   # residual connection + layer norm
        return x
```
I won't paste the feed-forward network since it's pretty simple.
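Since it's skipped above, here is a minimal sketch of what a FeedForward block usually looks like in this style of code; the 4x expansion, the ReLU, and the dropout value are my assumptions:

```python
class FeedForward(nn.Module):
    """Position-wise MLP applied after attention: expand, non-linearity, project back."""
    def __init__(self, n_embed, dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),   # expand to 4x the embedding width
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),   # project back down to n_embed
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
```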
Then comes multi-head attention:
```python
class Head(nn.Module):
    """One head of causal self-attention. Note: n_embed and block_size are read from module-level globals."""
    def __init__(self, head_size, dropout=0.2) -> None:
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # lower-triangular mask so each position can only attend to itself and the past
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        batch, time, channel = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        att = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5                 # scaled dot-product scores
        att = att.masked_fill(self.tril[:time, :time] == 0, float('-inf'))  # causal mask
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        return att @ v


class MultiheadAttention(nn.Module):
    def __init__(self, n_embed, n_head, head_size, dropout=0.2) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])
        self.proj = nn.Linear(head_size * n_head, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)   # concatenate head outputs along the channel dim
        x = self.dropout(self.proj(x))
        return x
```
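A quick shape check; since Head reads n_embed and block_size as module-level globals, they have to exist before the layers are built (the values below are placeholders):

```python
# Head reads these as globals, so define them first (placeholder values)
n_embed, block_size = 384, 256

mha = MultiheadAttention(n_embed=n_embed, n_head=6, head_size=n_embed // 6)
x = torch.randn(4, block_size, n_embed)   # (batch, time, channel)
print(mha(x).shape)                       # torch.Size([4, 256, 384])
```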
Broken down like this, it's actually fairly simple.
Next up is training.
Abandoned halfway, folks.
There's just no way: the corpus is tens of GB and my free storage space is nowhere near enough to hold it.
On top of that, the video never actually demonstrates the training run.
Off to learn diffusion instead 🥲
Diffusion got abandoned too!
I'm going to learn reinforcement learning! I want to build a Gomoku agent!