bclarkson-code / Tricycle

Autograd to GPT-2 completely from scratch
104 stars 7 forks source link

optimised embedding #59

Closed bclarkson-code closed 3 months ago

bclarkson-code commented 3 months ago

The embedding layer has been optimised and is now ~8.5x faster! I ran this benchmark:

import numpy as np

from tricycle.layers import Embedding, EmbeddingV2
from tricycle.tensor import to_tensor

# Number of forward+backward passes per benchmark run.
N_LOOPS = 100
# Index of the GPU each tensor/layer is moved to before timing.
DEVICE = 1

def test_embedding_original():
    """Benchmark the original Embedding layer: N_LOOPS forward + backward passes."""
    np.random.seed(0)
    token_ids = to_tensor(
        np.random.randint(0, 64, size=64),
        is_vector=True,
        requires_grad=False,
        dtype=int,
    )
    token_ids.to_gpu(DEVICE)

    embedding = Embedding(from_size=64, to_size=1024)
    embedding.to_gpu(DEVICE)

    for _ in range(N_LOOPS):
        # One full training-style step: forward pass then gradient computation.
        embedding(token_ids).backward()

def test_embedding_new():
    """Benchmark the optimised EmbeddingV2 layer: N_LOOPS forward + backward passes."""
    np.random.seed(0)
    token_ids = to_tensor(
        np.random.randint(0, 64, size=64),
        is_vector=True,
        requires_grad=False,
        dtype=int,
    )
    token_ids.to_gpu(DEVICE)

    embedding = EmbeddingV2(from_size=64, to_size=1024)
    embedding.to_gpu(DEVICE)

    for _ in range(N_LOOPS):
        # One full training-style step: forward pass then gradient computation.
        embedding(token_ids).backward()

__benchmarks__ = [(test_embedding_original, test_embedding_new, "original")]

And richbench gave this:


┏━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Benchmark ┃ Min     ┃ Max     ┃ Mean    ┃ Min (+)         ┃ Max (+)         ┃ Mean (+)        ┃
┡━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│  original │ 0.677   │ 0.744   │ 0.692   │ 0.080 (8.5x)    │ 0.081 (9.2x)    │ 0.080 (8.6x)    │
└───────────┴─────────┴─────────┴─────────┴─────────────────┴─────────────────┴─────────────────┘

The main changes were simply removing Python lists and replacing them with properly vectorised operations.