bigram language model

A bigram language model predicts the next token by looking only at the single token before it.
I don't feel like writing much about it; it's pretty primitive.
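For completeness, here is a minimal sketch of the usual nn.Embedding trick for a bigram LM in PyTorch (a hypothetical BigramLanguageModel, not the exact code from the video): row i of the table is read directly as the next-token logits for token i.

import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # row i holds the unnormalized next-token scores given current token i
        self.logits_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.logits_table(idx)  # (batch, time, vocab_size)
        loss = None
        if targets is not None:
            b, t, v = logits.shape
            loss = F.cross_entropy(logits.view(b * t, v), targets.view(b * t))
        return logits, loss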

small GPT

This code builds a simplified GPT from scratch. GPT is a decoder-only architecture, so all it takes is stacking decoder blocks one after another.
The video builds the overall framework first and then fills in the details step by step; that way of structuring the code is worth picking up.

First up is the GPTLanguageModel class:

import torch
import torch.nn as nn
import torch.nn.functional as F


class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size, block_size, n_embed, n_layer, n_head) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.block_size = block_size  # length of each sequence in a batch
        self.n_embed = n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Decoder(n_embed=n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, _input, target=None):
        # logits are the raw, unnormalized model outputs
        batch, time = _input.shape
        token_emb = self.embedding_table(_input)                                       # (batch, time, n_embed)
        posi_emb = self.position_embedding(torch.arange(time, device=_input.device))   # (time, n_embed)
        x = token_emb + posi_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if target is None:
            loss = None
        else:
            batch, time, vocab = logits.shape
            logits = logits.view(batch * time, vocab)
            target = target.view(batch * time)
            loss = F.cross_entropy(logits, target)
        return logits, loss

    def generate(self, _input, max_new_tokens):
        for _ in range(max_new_tokens):
            # crop the context to the last block_size tokens,
            # otherwise the position embedding runs out of indices
            idx_cond = _input[:, -self.block_size:]
            logits, _ = self.forward(idx_cond)
            # only the last position is used to predict the next token
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            _input_next = torch.multinomial(probs, num_samples=1)
            _input = torch.cat((_input, _input_next), dim=1)

        return _input
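A quick smoke test might look like this once all the classes below are defined (tiny made-up hyperparameters; note that Head reads n_embed and block_size as module-level globals, so they are set first):

n_embed, block_size = 64, 32   # also serve as the globals that Head reads
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GPTLanguageModel(vocab_size=65, block_size=block_size, n_embed=n_embed,
                         n_layer=2, n_head=4).to(device)

context = torch.zeros((1, 1), dtype=torch.long, device=device)  # a single start token (id 0)
model.eval()
with torch.no_grad():
    out = model.generate(context, max_new_tokens=20)            # (1, 21) tensor of token ids
print(out.shape)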

The generate method still has a small issue that I'll debug over the weekend, but training works now; the plan is to download the dataset onto the 4090 machine and run a proper training pass there.
The original Transformer uses a fixed function for its positional encoding, whereas GPT uses a learnable embedding for positions.
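For reference, a sketch of the fixed sinusoidal encoding from the original Transformer paper (not used anywhere in the model above; assumes an even n_embed):

import math
import torch

def sinusoidal_positions(block_size, n_embed):
    pos = torch.arange(block_size).unsqueeze(1)                                    # (block_size, 1)
    div = torch.exp(torch.arange(0, n_embed, 2) * (-math.log(10000.0) / n_embed))  # 10000^(-2i/n_embed)
    pe = torch.zeros(block_size, n_embed)
    pe[:, 0::2] = torch.sin(pos * div)   # even dimensions: sin
    pe[:, 1::2] = torch.cos(pos * div)   # odd dimensions: cos
    return pe                            # (block_size, n_embed)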

Next is the Decoder block:

class Decoder(nn.Module):
    def __init__(self, n_embed, n_head) -> None:
        super().__init__()
        # each head doesn't need to capture all of the features;
        # split the n_embed features evenly across the heads
        head_size = n_embed // n_head
        self.multihead = MultiheadAttention(n_embed, n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        # residual connection followed by LayerNorm (post-norm, as in the original Transformer)
        x1 = self.multihead(x)
        x = self.ln1(x + x1)
        x2 = self.ffwd(x)
        x = self.ln2(x + x2)
        return x

The feed-forward network is simple enough that I'm not pasting it here; a sketch of what it presumably looks like is below.
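A guess at the omitted FeedForward block (assuming the usual 4x hidden expansion with ReLU and dropout, matching the dropout=0.2 used elsewhere):

class FeedForward(nn.Module):
    def __init__(self, n_embed, dropout=0.2):
        super().__init__()
        # expand to 4 * n_embed, apply the nonlinearity, project back down
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)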
Next is the multi-head attention:

class Head(nn.Module):
    def __init__(self, head_size, dropout=0.2) -> None:
        super().__init__()
        # n_embed and block_size are assumed to be module-level globals here
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # register tril as a buffer so it doesn't have to be rebuilt every forward pass
        # (and it moves to the model's device automatically)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # time can be thought of as the token dimension
        batch, time, channel = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        att = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (batch, time, head_size) @ (batch, head_size, time) * dk^-0.5
        att = att.masked_fill(self.tril[:time, :time] == 0, float('-inf'))  # (batch, time, time)
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        return att @ v


class MultiheadAttention(nn.Module):
    def __init__(self, n_embed, n_head, head_size, dropout=0.2) -> None:
        super().__init__()
        # ModuleList works like a plain list, so each module's input/output can be handled freely
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])
        self.proj = nn.Linear(head_size * n_head, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # concatenate the heads along the feature dimension,
        # so each sample ends up back at n_embed dimensions
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        x = self.dropout(self.proj(x))
        return x
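A quick way to sanity-check the attention shapes, using the classes above (hypothetical numbers; again, n_embed and block_size must exist as globals for Head):

n_embed, block_size = 64, 32
mha = MultiheadAttention(n_embed=n_embed, n_head=4, head_size=n_embed // 4)
x = torch.randn(2, block_size, n_embed)   # (batch, time, n_embed)
print(mha(x).shape)                       # torch.Size([2, 32, 64])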

Broken down like this it's all fairly simple.
Next up is training.
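For reference, a generic training loop would look roughly like this (a sketch: get_batch, the learning rate, and the step counts are placeholders, not the video's actual setup):

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

for step in range(5000):
    xb, yb = get_batch('train')            # (batch, block_size) int64 tensors of token ids
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if step % 500 == 0:
        print(f"step {step}: loss {loss.item():.4f}")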

Well, this got cut short, folks.

Not happening: the corpus is tens of gigabytes, and my free storage space is nowhere near enough.
The video doesn't demo the actual training either.
Off to learn diffusion instead 🥲

Diffusion got cut short too!
I'm going to learn reinforcement learning! I want to build a Gomoku agent!