class GPT(nn.Module):
    """Language model assembled from embeddings, a ``Block`` stack and a head.

    The forward pass sums a token embedding and a position embedding, runs
    the result through ``config.n_layer`` ``Block`` modules in order, applies
    a final ``LayerNorm`` and projects to vocabulary logits with a bias-free
    linear layer.
    """

    def __init__(self, config):
        """Create the sub-modules described by ``config``.

        Args:
            config: Object exposing ``vocab_size``, ``block_size``,
                ``n_embd`` and ``n_layer``. It is also handed unchanged to
                every ``Block`` constructor.
        """
        super().__init__()
        # Keys and construction order are kept stable: they determine the
        # state-dict names and the order in which parameters are initialised.
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        """Compute logits for every position of ``idx``.

        Args:
            idx: 2-D tensor of token indices, laid out as (batch, sequence).

        Returns:
            The output of ``lm_head`` applied at every position; its last
            dimension is ``vocab_size``. (Assumes each ``Block`` returns a
            tensor of the same shape it receives -- ``Block`` is defined
            elsewhere; confirm there.)

        Raises:
            ValueError: If the sequence dimension is longer than the
                position-embedding table (``block_size`` rows), or if
                ``idx`` is not 2-D (from the size unpacking).
        """
        _, seq_len = idx.size()

        # Fail early with a readable message instead of an opaque
        # out-of-range lookup inside the position embedding.
        block_size = self.transformer.wpe.num_embeddings
        if seq_len > block_size:
            raise ValueError(
                f"Cannot forward sequence of length {seq_len}: "
                f"block size is only {block_size}"
            )

        tok_emb = self.transformer.wte(idx)
        # Shape (1, seq_len) so the position embedding broadcasts over batch.
        pos = torch.arange(0, seq_len, device=idx.device).unsqueeze(0)
        pos_emb = self.transformer.wpe(pos)

        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        return self.lm_head(x)