From 03f64187c66a1a1d7cfac5e00d972fefa15d698b Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sat, 25 May 2024 23:04:18 +0800 Subject: [PATCH 1/6] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b462662..1192fc1 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ python examples/data/shakespeare.py And finally, let's train a GPT: ```bash -python examples/train-GPT.py +python examples/train-gpt.py ``` This runs on CPU and should get train loss: 1.65 and test loss: 1.80 after 2000 iterations. From 5ce1f717d5bfe6d6e33a09710234ba8758c7fa3b Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 00:04:53 +0800 Subject: [PATCH 2/6] gpt flops counters --- examples/train-gpt.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index a05587e..6f464e0 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -11,6 +11,34 @@ d_value = 32 num_blocks = 4 +GPU_16BIT_FLOPS = { + "h100-sxm": 1.979e15 / 2, + "h100-pcie": 1.513e15 / 2, + "a100": 312e12, + "v100-sxm": 125e12, + "6000A": 364.25e12, + "4090": 165.2 * 10**12, + "3090": 71 * 10**12, + "t4": 65e12, +} +def xf_layer_fwd_flops(slen: int, bs: int=1, causal=True) -> int: + p_mlp = d_embed * 4 * d_embed * 2 + f_mlp = p_mlp * 2 * slen + + assert d_query == d_value, "Dq != Dv not implemented" + p_att = 4 * d_embed * d_embed + f_att = p_att * 2 * slen + f_sdpa = 4 * slen * slen * d_embed // (2 if causal else 1) # approximation + + return (f_mlp + f_att + f_sdpa) * bs + +def gpt_train_flops(slen: int, bs: int, causal=True) -> int: + # lmhead layer: + flops = 6 * slen * bs * d_embed * vocab_size + # assume no activation checkpointing + flops += num_blocks * xf_layer_fwd_flops(slen, bs, causal) * 3 + return flops + # training hparams init_lr = 0.5 From 4333a249d918b6fb97799a09ba0985291e459479 Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 00:06:42 +0800 Subject: [PATCH 3/6] compilable cuda bf16 train-gpt.py --- examples/train-gpt.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index 6f464e0..8f323c7 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -108,8 +108,9 @@ def __len__(self): # now let's start doing stuff -if __name__ == "__main__": +@torch.cuda.amp.autocast(dtype=torch.bfloat16) +def train(device, ideal_flops_per_sec): # load the data trainset = SimpleLLMDataset(np.memmap("examples/data/shakespeare/train.bin", dtype=np.uint16, mode='r'), context) @@ -124,12 +125,15 @@ def __len__(self): train_iterator = iter(train_loader) test_iterator = iter(test_loader) - getBatch = lambda train: next(train_iterator if train else test_iterator) + def getBatch(train: bool) -> list: + res = next(train_iterator if train else test_iterator) + return [t.to(device=device) for t in res] # load the model gpt = GPT(vocab_size, context, num_heads, d_embed, d_query, d_value, num_blocks) - weights = gpt.initialize(device="cpu") + weights = gpt.initialize(device=device) + gpt.forward = torch.compile(gpt.forward) # initialize the Adam state @@ -188,6 +192,18 @@ def __len__(self): weights.zero_grad() if step % log_interval == 0: - print( "step:", step, - "\t train loss:", "%.2f" % train_loss.item(), - "\t test loss:", "%.2f" % test_loss.item() ) + print( + "step:", 
step, + "\t train loss:", "%.2f" % train_loss.item(), + "\t test loss:", "%.2f" % test_loss.item(), + ) + + +if __name__ == "__main__": + import argparse + ap = argparse.ArgumentParser() + ap.add_argument('--cuda', action='store_true') + args = ap.parse_args() + + torch.set_float32_matmul_precision("medium") + train('cuda' if args.cuda else 'cpu', GPU_16BIT_FLOPS['3090']) From 439351d5a6ebdc62cc467e5259a7c1f8f68cbc6b Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 00:07:35 +0800 Subject: [PATCH 4/6] tokens-per-sec and model-flops-utilization logging --- examples/train-gpt.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index 8f323c7..5a559cb 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -1,3 +1,4 @@ +import time import torch import numpy as np @@ -39,6 +40,21 @@ def gpt_train_flops(slen: int, bs: int, causal=True) -> int: flops += num_blocks * xf_layer_fwd_flops(slen, bs, causal) * 3 return flops +class SpeedLogger: + def __init__(self, ideal_flops_per_sec: float): + self.tps = [] + self.mfu = [] + self.fps = ideal_flops_per_sec + + def add(self, slen: int, bs: int, duration: float) -> tuple[float,float]: + flops = gpt_train_flops(slen, bs) + self.tps.append(slen*bs / duration) + self.mfu.append(flops / duration / self.fps) + return self.tps[-1], self.mfu[-1] + + def ave(self): + return sum(self.tps) / len(self.tps), sum(self.mfu) / len(self.mfu) + # training hparams init_lr = 0.5 @@ -146,6 +162,8 @@ def getBatch(train: bool) -> list: # train the model + speed_logger = SpeedLogger(ideal_flops_per_sec) + for step in range(steps): if step % log_interval == 0: @@ -163,6 +181,7 @@ def getBatch(train: bool) -> list: test_loss /= eval_steps test_acc /= eval_steps + t0 = time.time() data, target = getBatch(train = True) output = gpt.forward(data, weights) output = output.view(-1, output.size(-1)) @@ -191,11 +210,16 @@ def getBatch(train: bool) -> list: gpt.regularize(weights, strength = init_lr * schedule * wd) weights.zero_grad() + speed_logger.add(*data.shape, time.time() - t0) + if step % log_interval == 0: + tps, mfu = speed_logger.ave() print( "step:", step, - "\t train loss:", "%.2f" % train_loss.item(), + "\t train loss:", "%.2f" % train_loss.item(), "\t test loss:", "%.2f" % test_loss.item(), + f"\t tokens/gpu/sec: {tps:.2f}", + f"\t MFU: {mfu*100:.2f}%", ) From 1246cf86e75c13171262fd6b1acd0b0e3330cad6 Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 00:08:28 +0800 Subject: [PATCH 5/6] change hparams to just barely fit on 3090 --- examples/train-gpt.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index 5a559cb..c4861ff 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -12,6 +12,15 @@ d_value = 32 num_blocks = 4 +# Llama-7b-like values, excluding the vocabulary size. 
+vocab_size = 256 +context = 1024 +num_heads = 32 +d_embed = 4096 +d_query = 128 +d_value = 128 +num_blocks = 4 + GPU_16BIT_FLOPS = { "h100-sxm": 1.979e15 / 2, "h100-pcie": 1.513e15 / 2, @@ -59,10 +68,10 @@ def ave(self): init_lr = 0.5 wd = 0.01 -batch_size = 12 +batch_size = 2 # 12 steps = 2001 eval_steps = 100 -log_interval = 200 +log_interval = 10 # 200 # let's start by defining our GPT architecture # (we could instead just import GPT from modula.compound) From 37669edf7a78adcc8e7b71686d20349daf58e5bf Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 21:49:14 +0800 Subject: [PATCH 6/6] avoid logging first 2 steps && insert commented failed compiles --- examples/train-gpt.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index c4861ff..ea327f9 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -159,6 +159,9 @@ def getBatch(train: bool) -> list: gpt = GPT(vocab_size, context, num_heads, d_embed, d_query, d_value, num_blocks) weights = gpt.initialize(device=device) gpt.forward = torch.compile(gpt.forward) + # gpt.normalize = torch.compile(gpt.normalize) + # gpt.regularize = torch.compile(gpt.regularize) + # init_lr_t = torch.tensor(init_lr, device=device) # initialize the Adam state @@ -219,9 +222,11 @@ def getBatch(train: bool) -> list: gpt.regularize(weights, strength = init_lr * schedule * wd) weights.zero_grad() - speed_logger.add(*data.shape, time.time() - t0) + # avoid first compile && first recompile + if step > 1: + speed_logger.add(*data.shape, time.time() - t0) - if step % log_interval == 0: + if step > 1 and step % log_interval == 0: tps, mfu = speed_logger.ave() print( "step:", step,
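
Usage note: with the series applied, the GPU path is opt-in via the `--cuda` flag that patch 3 adds through argparse (without it the script still runs on CPU, as before). A usage sketch, mirroring the README invocation:

```bash
python examples/train-gpt.py --cuda
```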
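
The tokens/sec and MFU figures logged by patch 4 are derived directly from the FLOPS counters introduced in patch 2. Below is a minimal standalone sketch of that arithmetic, assuming the Llama-7b-like hyperparameters from patch 5 and the 3090 peak from `GPU_16BIT_FLOPS`; `measured_step_seconds` is a hypothetical placeholder, not a benchmark result:

```python
# Standalone sketch of the SpeedLogger arithmetic (hyperparameters from patch 5).
vocab_size = 256
context = 1024
d_embed = 4096
d_query = 128
d_value = 128
num_blocks = 4
batch_size = 2

def xf_layer_fwd_flops(slen: int, bs: int = 1, causal: bool = True) -> int:
    f_mlp = (d_embed * 4 * d_embed * 2) * 2 * slen              # two MLP matmuls, 2 FLOPs per MAC
    assert d_query == d_value, "Dq != Dv not implemented"
    f_att = (4 * d_embed * d_embed) * 2 * slen                  # Q, K, V, O projections
    f_sdpa = 4 * slen * slen * d_embed // (2 if causal else 1)  # attention scores + values, ~halved when causal
    return (f_mlp + f_att + f_sdpa) * bs

def gpt_train_flops(slen: int, bs: int, causal: bool = True) -> int:
    flops = 6 * slen * bs * d_embed * vocab_size                     # lm head, forward + backward
    flops += num_blocks * xf_layer_fwd_flops(slen, bs, causal) * 3   # backward ~ 2x forward, no activation checkpointing
    return flops

measured_step_seconds = 0.5   # hypothetical placeholder, not a measurement
peak_flops_per_sec = 71e12    # GPU_16BIT_FLOPS["3090"]

tokens_per_sec = context * batch_size / measured_step_seconds
mfu = gpt_train_flops(context, batch_size) / measured_step_seconds / peak_flops_per_sec
print(f"tokens/gpu/sec: {tokens_per_sec:.2f}   MFU: {mfu * 100:.2f}%")
```

Note that patch 6 only starts feeding `SpeedLogger` after step 1, so the averaged MFU is not dragged down by the initial `torch.compile` compilation.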