From 03f64187c66a1a1d7cfac5e00d972fefa15d698b Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sat, 25 May 2024 23:04:18 +0800 Subject: [PATCH 1/6] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b462662..1192fc1 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ python examples/data/shakespeare.py And finally, let's train a GPT: ```bash -python examples/train-GPT.py +python examples/train-gpt.py ``` This runs on CPU and should get train loss: 1.65 and test loss: 1.80 after 2000 iterations. From 5ce1f717d5bfe6d6e33a09710234ba8758c7fa3b Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 00:04:53 +0800 Subject: [PATCH 2/6] gpt flops counters --- examples/train-gpt.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index a05587e..6f464e0 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -11,6 +11,34 @@ d_value = 32 num_blocks = 4 +GPU_16BIT_FLOPS = { + "h100-sxm": 1.979e15 / 2, + "h100-pcie": 1.513e15 / 2, + "a100": 312e12, + "v100-sxm": 125e12, + "6000A": 364.25e12, + "4090": 165.2 * 10**12, + "3090": 71 * 10**12, + "t4": 65e12, +} +def xf_layer_fwd_flops(slen: int, bs: int=1, causal=True) -> int: + p_mlp = d_embed * 4 * d_embed * 2 + f_mlp = p_mlp * 2 * slen + + assert d_query == d_value, "Dq != Dv not implemented" + p_att = 4 * d_embed * d_embed + f_att = p_att * 2 * slen + f_sdpa = 4 * slen * slen * d_embed // (2 if causal else 1) # approximation + + return (f_mlp + f_att + f_sdpa) * bs + +def gpt_train_flops(slen: int, bs: int, causal=True) -> int: + # lmhead layer: + flops = 6 * slen * bs * d_embed * vocab_size + # assume no activation checkpointing + flops += num_blocks * xf_layer_fwd_flops(slen, bs, causal) * 3 + return flops + # training hparams init_lr = 0.5 From 4333a249d918b6fb97799a09ba0985291e459479 Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 00:06:42 +0800 Subject: [PATCH 3/6] compilable cuda bf16 train-gpt.py --- examples/train-gpt.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index 6f464e0..8f323c7 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -108,8 +108,9 @@ def __len__(self): # now let's start doing stuff -if __name__ == "__main__": +@torch.cuda.amp.autocast(dtype=torch.bfloat16) +def train(device, ideal_flops_per_sec): # load the data trainset = SimpleLLMDataset(np.memmap("examples/data/shakespeare/train.bin", dtype=np.uint16, mode='r'), context) @@ -124,12 +125,15 @@ def __len__(self): train_iterator = iter(train_loader) test_iterator = iter(test_loader) - getBatch = lambda train: next(train_iterator if train else test_iterator) + def getBatch(train: bool) -> list: + res = next(train_iterator if train else test_iterator) + return [t.to(device=device) for t in res] # load the model gpt = GPT(vocab_size, context, num_heads, d_embed, d_query, d_value, num_blocks) - weights = gpt.initialize(device="cpu") + weights = gpt.initialize(device=device) + gpt.forward = torch.compile(gpt.forward) # initialize the Adam state @@ -188,6 +192,18 @@ def __len__(self): weights.zero_grad() if step % log_interval == 0: - print( "step:", step, - "\t train loss:", "%.2f" % train_loss.item(), - "\t test loss:", "%.2f" % test_loss.item() ) + print( + "step:", 
step, + "\t train loss:", "%.2f" % train_loss.item(), + "\t test loss:", "%.2f" % test_loss.item(), + ) + + +if __name__ == "__main__": + import argparse + ap = argparse.ArgumentParser() + ap.add_argument('--cuda', action='store_true') + args = ap.parse_args() + + torch.set_float32_matmul_precision("medium") + train('cuda' if args.cuda else 'cpu', GPU_16BIT_FLOPS['3090']) From 439351d5a6ebdc62cc467e5259a7c1f8f68cbc6b Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 00:07:35 +0800 Subject: [PATCH 4/6] tokens-per-sec and model-flops-utilization logging --- examples/train-gpt.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index 8f323c7..5a559cb 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -1,3 +1,4 @@ +import time import torch import numpy as np @@ -39,6 +40,21 @@ def gpt_train_flops(slen: int, bs: int, causal=True) -> int: flops += num_blocks * xf_layer_fwd_flops(slen, bs, causal) * 3 return flops +class SpeedLogger: + def __init__(self, ideal_flops_per_sec: float): + self.tps = [] + self.mfu = [] + self.fps = ideal_flops_per_sec + + def add(self, slen: int, bs: int, duration: float) -> tuple[float,float]: + flops = gpt_train_flops(slen, bs) + self.tps.append(slen*bs / duration) + self.mfu.append(flops / duration / self.fps) + return self.tps[-1], self.mfu[-1] + + def ave(self): + return sum(self.tps) / len(self.tps), sum(self.mfu) / len(self.mfu) + # training hparams init_lr = 0.5 @@ -146,6 +162,8 @@ def getBatch(train: bool) -> list: # train the model + speed_logger = SpeedLogger(ideal_flops_per_sec) + for step in range(steps): if step % log_interval == 0: @@ -163,6 +181,7 @@ def getBatch(train: bool) -> list: test_loss /= eval_steps test_acc /= eval_steps + t0 = time.time() data, target = getBatch(train = True) output = gpt.forward(data, weights) output = output.view(-1, output.size(-1)) @@ -191,11 +210,16 @@ def getBatch(train: bool) -> list: gpt.regularize(weights, strength = init_lr * schedule * wd) weights.zero_grad() + speed_logger.add(*data.shape, time.time() - t0) + if step % log_interval == 0: + tps, mfu = speed_logger.ave() print( "step:", step, - "\t train loss:", "%.2f" % train_loss.item(), + "\t train loss:", "%.2f" % train_loss.item(), "\t test loss:", "%.2f" % test_loss.item(), + f"\t tokens/gpu/sec: {tps:.2f}", + f"\t MFU: {mfu*100:.2f}%", ) From 1246cf86e75c13171262fd6b1acd0b0e3330cad6 Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 00:08:28 +0800 Subject: [PATCH 5/6] change hparams to just barely fit on 3090 --- examples/train-gpt.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index 5a559cb..c4861ff 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -12,6 +12,15 @@ d_value = 32 num_blocks = 4 +# Llama-7b-like values, excluding the vocabulary size. 
+vocab_size = 256 +context = 1024 +num_heads = 32 +d_embed = 4096 +d_query = 128 +d_value = 128 +num_blocks = 4 + GPU_16BIT_FLOPS = { "h100-sxm": 1.979e15 / 2, "h100-pcie": 1.513e15 / 2, @@ -59,10 +68,10 @@ def ave(self): init_lr = 0.5 wd = 0.01 -batch_size = 12 +batch_size = 2 # 12 steps = 2001 eval_steps = 100 -log_interval = 200 +log_interval = 10 # 200 # let's start by defining our GPT architecture # (we could instead just import GPT from modula.compound) From 37669edf7a78adcc8e7b71686d20349daf58e5bf Mon Sep 17 00:00:00 2001 From: 152334H <54623771+152334H@users.noreply.github.com> Date: Sun, 26 May 2024 21:49:14 +0800 Subject: [PATCH 6/6] avoid logging first 2 steps && insert commented failed compiles --- examples/train-gpt.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/train-gpt.py b/examples/train-gpt.py index c4861ff..ea327f9 100644 --- a/examples/train-gpt.py +++ b/examples/train-gpt.py @@ -159,6 +159,9 @@ def getBatch(train: bool) -> list: gpt = GPT(vocab_size, context, num_heads, d_embed, d_query, d_value, num_blocks) weights = gpt.initialize(device=device) gpt.forward = torch.compile(gpt.forward) + # gpt.normalize = torch.compile(gpt.normalize) + # gpt.regularize = torch.compile(gpt.regularize) + # init_lr_t = torch.tensor(init_lr, device=device) # initialize the Adam state @@ -219,9 +222,11 @@ def getBatch(train: bool) -> list: gpt.regularize(weights, strength = init_lr * schedule * wd) weights.zero_grad() - speed_logger.add(*data.shape, time.time() - t0) + # avoid first compile && first recompile + if step > 1: + speed_logger.add(*data.shape, time.time() - t0) - if step % log_interval == 0: + if step > 1 and step % log_interval == 0: tps, mfu = speed_logger.ave() print( "step:", step,
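
Usage note: with the series applied, the GPU path is opt-in via the `--cuda` flag that patch 3 adds through argparse (without it the script still runs on CPU, as before). A usage sketch, mirroring the README invocation:

```bash
python examples/train-gpt.py --cuda
```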
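
The tokens/sec and MFU figures logged by patch 4 are derived directly from the FLOPS counters introduced in patch 2. Below is a minimal standalone sketch of that arithmetic, assuming the Llama-7b-like hyperparameters from patch 5 and the 3090 peak from `GPU_16BIT_FLOPS`; `measured_step_seconds` is a hypothetical placeholder, not a benchmark result:

```python
# Standalone sketch of the SpeedLogger arithmetic (hyperparameters from patch 5).
vocab_size = 256
context = 1024
d_embed = 4096
d_query = 128
d_value = 128
num_blocks = 4
batch_size = 2

def xf_layer_fwd_flops(slen: int, bs: int = 1, causal: bool = True) -> int:
    f_mlp = (d_embed * 4 * d_embed * 2) * 2 * slen              # two MLP matmuls, 2 FLOPs per MAC
    assert d_query == d_value, "Dq != Dv not implemented"
    f_att = (4 * d_embed * d_embed) * 2 * slen                  # Q, K, V, O projections
    f_sdpa = 4 * slen * slen * d_embed // (2 if causal else 1)  # attention scores + values, ~halved when causal
    return (f_mlp + f_att + f_sdpa) * bs

def gpt_train_flops(slen: int, bs: int, causal: bool = True) -> int:
    flops = 6 * slen * bs * d_embed * vocab_size                     # lm head, forward + backward
    flops += num_blocks * xf_layer_fwd_flops(slen, bs, causal) * 3   # backward ~ 2x forward, no activation checkpointing
    return flops

measured_step_seconds = 0.5   # hypothetical placeholder, not a measurement
peak_flops_per_sec = 71e12    # GPU_16BIT_FLOPS["3090"]

tokens_per_sec = context * batch_size / measured_step_seconds
mfu = gpt_train_flops(context, batch_size) / measured_step_seconds / peak_flops_per_sec
print(f"tokens/gpu/sec: {tokens_per_sec:.2f}   MFU: {mfu * 100:.2f}%")
```

Note that patch 6 only starts feeding `SpeedLogger` after step 1, so the averaged MFU is not dragged down by the initial `torch.compile` compilation.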