BasedLukas opened this issue 1 year ago
In the Colab notebook linked under your YT video, the shape annotations for the single-headed attention appear to be incorrect.
```python
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out
```
I believe v.shape is not (B, T, C) but rather (B, T, hs), since nn.Linear(n_embd, head_size) projects the channel dimension from n_embd down to head_size. In this repository it is correct:
```python
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out
```
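For anyone who hits the same confusion, here is a minimal sanity check. It assumes the corrected `Head` class above is already defined, and uses hypothetical hyperparameters chosen so that `head_size != n_embd`, which makes the difference between (B, T, C) and (B, T, hs) visible in the output shape:

```python
import torch
import torch.nn as nn
from torch.nn import functional as F

# hypothetical hyperparameters, picked so that head_size != n_embd
n_embd = 32      # C: channels of the input embedding
block_size = 8   # T: maximum context length
head_size = 16   # hs: output width of one attention head
dropout = 0.0

torch.manual_seed(0)
head = Head(head_size)                   # the corrected Head class from above
x = torch.randn(4, block_size, n_embd)   # (B, T, C) = (4, 8, 32)
out = head(x)

print(out.shape)  # torch.Size([4, 8, 16]) -> (B, T, hs), not (B, T, C)
```

Running this prints `torch.Size([4, 8, 16])`, i.e. (B, T, hs) rather than (B, T, C), which matches the comments in the corrected version.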
This caused me some confusion, maybe you could change it? Thank you for such a wonderful, educational project!