From ebab64ab775b340729ad3cb62ad662ab98f07e0d Mon Sep 17 00:00:00 2001 From: Avik De Date: Tue, 10 Feb 2026 15:01:33 -0500 Subject: [PATCH 1/4] Initial file --- scripts/xor.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 scripts/xor.py diff --git a/scripts/xor.py b/scripts/xor.py new file mode 100644 index 0000000..ecd9437 --- /dev/null +++ b/scripts/xor.py @@ -0,0 +1,49 @@ +import onnx +from onnx import helper, TensorProto + +# 1. Define Inputs and Outputs +# Input 'X' is a 1x2 vector +X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 2]) +# Output 'Y' is a 1x1 scalar +Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1]) + +# 2. Define Weights and Biases (Pre-trained values for XOR) +# Hidden layer (2 inputs -> 2 outputs) +w1_vals = [1.0, 1.0, 1.0, 1.0] +b1_vals = [0.0, -1.0] +# Output layer (2 inputs -> 1 output) +w2_vals = [1.0, -2.0] +b2_vals = [0.0] + +# Create Initializer (Weight) Tensors +W1 = helper.make_tensor('W1', TensorProto.FLOAT, [2, 2], w1_vals) +B1 = helper.make_tensor('B1', TensorProto.FLOAT, [2], b1_vals) +W2 = helper.make_tensor('W2', TensorProto.FLOAT, [2, 1], w2_vals) +B2 = helper.make_tensor('B2', TensorProto.FLOAT, [1], b2_vals) + +# 3. Create the Computation Nodes +# Node 1: MatMul (X * W1) +node1 = helper.make_node('MatMul', ['X', 'W1'], ['dot1']) +# Node 2: Add Bias +node2 = helper.make_node('Add', ['dot1', 'B1'], ['plus1']) +# Node 3: ReLU +node3 = helper.make_node('Relu', ['plus1'], ['relu1']) +# Node 4: MatMul (relu1 * W2) +node4 = helper.make_node('MatMul', ['relu1', 'W2'], ['dot2']) +# Node 5: Add Bias (Final Output) +node5 = helper.make_node('Add', ['dot2', 'B2'], ['Y']) + +# 4. Build the Graph and Model +graph = helper.make_graph( + [node1, node2, node3, node4, node5], + 'XOR_Network', + [X], [Y], + [W1, B1, W2, B2] +) + +model = helper.make_model(graph, producer_name='onnx-example') + +# 5. Save the model +onnx.save(model, 'xor_model.onnx') +print("Model saved as xor_model.onnx") + From 717fdee5e03faeea82e53475925d2bcf7283adae Mon Sep 17 00:00:00 2001 From: Avik De Date: Tue, 10 Feb 2026 17:28:41 -0500 Subject: [PATCH 2/4] WIP --- README.md | 17 ++++++++++++++--- scripts/create_onnx.py | 0 2 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 scripts/create_onnx.py diff --git a/README.md b/README.md index de05095..e53afa8 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,18 @@ cd build && ctest --verbose Tests produce waveform files (`*.fst`) in `test/sim_build/`. Open them in VSCode with the Surfer extension to inspect signals. -## Architecture +## Supported operations + +Most previous projects have focused on a matrix multiply followed by activation, mimicking Google's TPU, and aiming to solve something like the XOR classification problem. + +For this project, our goal was to develop something broadly applicable. We initially focused on General Matrix Multiply (GEMM); because it parameterizes the scaling constants `α` and `β` (`C = αAB + βC`), it subsumes pure matrix multiplication as a special case and enables fused multiply-accumulate patterns that avoid redundant memory writes. Many types of computations common in deep learning, including matrix multiplication, are [specializations of a GEMM operation](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html). 
GEMMs show up in [dense fully connected networks](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html) that are a core component of transformers and RNNs.
+
+Lastly, it is one of the [Basic Linear Algebra Subprograms (BLAS)](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) that have been the bedrock of scientific computing.
+
+## Systolic array implementation
+
+Based on the systolic array architecture described in H. T. Kung's paper, "Why Systolic Architectures?" (IEEE Computer, 1982):
+- A network of processing elements (PEs)
 
 ### PE (`pe.sv`)
 
@@ -61,6 +72,6 @@ This is why it's called "weight-stationary" — weights move once, data flows re
 
 There are a number of "tiny TPU"-type projects, due to the current popularity of TPUs and LLMs.
 
-- [tiny-tpu-v2/tiny-tpu](https://github.com/tiny-tpu-v2/tiny-tpu/tree/main)
-- [Alanma23/tinytinyTPU](https://github.com/Alanma23/tinytinyTPU)
+- [tiny-tpu-v2/tiny-tpu](https://github.com/tiny-tpu-v2/tiny-tpu/tree/main) - 2x2 matmul + ReLU to solve XOR problem
+- [Alanma23/tinytinyTPU](https://github.com/Alanma23/tinytinyTPU) - 2x2 matmul + ReLU / ReLU6
 
diff --git a/scripts/create_onnx.py b/scripts/create_onnx.py
new file mode 100644
index 0000000..e69de29

From 6974e91b770e13754f0134570b2b0400af6e0fa4 Mon Sep 17 00:00:00 2001
From: Avik De
Date: Tue, 10 Feb 2026 17:29:46 -0500
Subject: [PATCH 3/4] xor model with matmul and add

---
 scripts/create_onnx.py | 48 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/scripts/create_onnx.py b/scripts/create_onnx.py
index e69de29..7108af2 100644
--- a/scripts/create_onnx.py
+++ b/scripts/create_onnx.py
@@ -0,0 +1,48 @@
+import onnx
+from onnx import helper, TensorProto
+
+# 1. Define Inputs and Outputs
+# Input 'X' is a 1x2 vector
+X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 2])
+# Output 'Y' is a 1x1 scalar
+Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1])
+
+# 2. Define Weights and Biases (Pre-trained values for XOR)
+# Hidden layer (2 inputs -> 2 outputs)
+w1_vals = [1.0, 1.0, 1.0, 1.0]
+b1_vals = [0.0, -1.0]
+# Output layer (2 inputs -> 1 output)
+w2_vals = [1.0, -2.0]
+b2_vals = [0.0]
+
+# Create Initializer (Weight) Tensors
+W1 = helper.make_tensor('W1', TensorProto.FLOAT, [2, 2], w1_vals)
+B1 = helper.make_tensor('B1', TensorProto.FLOAT, [2], b1_vals)
+W2 = helper.make_tensor('W2', TensorProto.FLOAT, [2, 1], w2_vals)
+B2 = helper.make_tensor('B2', TensorProto.FLOAT, [1], b2_vals)
+
+# 3. Create the Computation Nodes
+# Node 1: MatMul (X * W1)
+node1 = helper.make_node('MatMul', ['X', 'W1'], ['dot1'])
+# Node 2: Add Bias
+node2 = helper.make_node('Add', ['dot1', 'B1'], ['plus1'])
+# Node 3: ReLU
+node3 = helper.make_node('Relu', ['plus1'], ['relu1'])
+# Node 4: MatMul (relu1 * W2)
+node4 = helper.make_node('MatMul', ['relu1', 'W2'], ['dot2'])
+# Node 5: Add Bias (Final Output)
+node5 = helper.make_node('Add', ['dot2', 'B2'], ['Y'])
+
+# 4. Build the Graph and Model
+graph = helper.make_graph(
+    [node1, node2, node3, node4, node5],
+    'XOR_Network',
+    [X], [Y],
+    [W1, B1, W2, B2]
+)
+
+model = helper.make_model(graph, producer_name='onnx-example')
+
+# 5. Save the model
+onnx.save(model, 'xor_model.onnx')
+print("Model saved as xor_model.onnx")
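For reference, the hand-picked weights above encode XOR as `Y = ReLU(x1 + x2) - 2 * ReLU(x1 + x2 - 1)`, which evaluates to 0, 1, 1, 0 for the four input combinations. A quick sanity check of the saved graph (not part of this patch series; assumes `numpy` and `onnxruntime` are installed) could look something like this:

```python
# Sanity check for xor_model.onnx (illustrative only; not part of these patches).
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("xor_model.onnx")
for x1 in (0.0, 1.0):
    for x2 in (0.0, 1.0):
        x = np.array([[x1, x2]], dtype=np.float32)
        (y,) = sess.run(["Y"], {"X": x})  # run the graph on one input pair
        print(f"XOR({int(x1)}, {int(x2)}) = {y[0, 0]:.1f}")
# Expected output: 0.0, 1.0, 1.0, 0.0
```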
From f8cc256feade38e2313885e150a29fc11e22429b Mon Sep 17 00:00:00 2001
From: Avik De
Date: Tue, 10 Feb 2026 17:33:23 -0500
Subject: [PATCH 4/4] start with matmul

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e53afa8..aa76528 100644
--- a/README.md
+++ b/README.md
@@ -35,11 +35,11 @@ Tests produce waveform files (`*.fst`) in `test/sim_build/`. Open them in VSCode
 
 ## Supported operations
 
-Most previous projects have focused on a matrix multiply followed by activation, mimicking Google's TPU, and aiming to solve something like the XOR classification problem.
+For this project, our goal was to develop something broadly applicable.
 
-For this project, our goal was to develop something broadly applicable. We initially focused on General Matrix Multiply (GEMM); because it parameterizes the scaling constants `α` and `β` (`C = αAB + βC`), it subsumes pure matrix multiplication as a special case and enables fused multiply-accumulate patterns that avoid redundant memory writes. Many types of computations common in deep learning, including matrix multiplication, are [specializations of a GEMM operation](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html). GEMMs show up in [dense fully connected networks](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html) that are a core component of transformers and RNNs.
+We initially focused on General Matrix Multiply (GEMM); because it parameterizes the scaling constants `α` and `β` (`C = αAB + βC`), it subsumes pure matrix multiplication as a special case and enables fused multiply-accumulate patterns that avoid redundant memory writes. Many types of computations common in deep learning, including matrix multiplication, are [specializations of a GEMM operation](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html). GEMMs show up in [dense fully connected networks](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html) that are a core component of transformers and RNNs. Lastly, it is one of the [Basic Linear Algebra Subprograms (BLAS)](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) that have been the bedrock of scientific computing.
 
-Lastly, it is one of the [Basic Linear Algebra Subprograms (BLAS)](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) that have been the bedrock of scientific computing.
+As a first step toward full GEMM support, we start with a plain MatMul (`C = AB`).
 
 ## Systolic array implementation
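Looking ahead, the `C = αAB + βC` form called out in the README maps directly onto ONNX's fused `Gemm` operator, whose `alpha` and `beta` attributes play the role of α and β. Once the hardware supports full GEMM, each MatMul + Add pair in `scripts/create_onnx.py` could collapse into a single node, with the Relu node and the rest of the graph unchanged. The sketch below is illustrative only (not part of these patches) and reuses the tensor names from that script:

```python
# Sketch only: fold each MatMul + Add pair into a single ONNX Gemm node,
# where Gemm computes alpha * (A @ B) + beta * C. Not part of this patch series.
from onnx import helper

# Hidden layer: plus1 = 1.0 * (X @ W1) + 1.0 * B1  (replaces node1 and node2)
gemm1 = helper.make_node('Gemm', ['X', 'W1', 'B1'], ['plus1'], alpha=1.0, beta=1.0)

# Output layer: Y = 1.0 * (relu1 @ W2) + 1.0 * B2  (replaces node4 and node5)
gemm2 = helper.make_node('Gemm', ['relu1', 'W2', 'B2'], ['Y'], alpha=1.0, beta=1.0)
```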