17 changes: 14 additions & 3 deletions README.md
@@ -33,7 +33,18 @@ cd build && ctest --verbose

Tests produce waveform files (`*.fst`) in `test/sim_build/`. Open them in VSCode with the Surfer extension to inspect signals.

## Architecture
## Supported operations

Our goal for this project was to support an operation that is broadly applicable, rather than a one-off kernel.

We initially focused on General Matrix Multiply (GEMM). Because it parameterizes the scaling constants `α` and `β` (`C = αAB + βC`), it subsumes pure matrix multiplication as a special case and enables fused multiply-accumulate patterns that avoid redundant memory writes. Many computations common in deep learning, including matrix multiplication, are [specializations of a GEMM operation](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html). GEMMs show up in the [dense fully connected layers](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html) that are a core component of transformers and RNNs. Lastly, GEMM is one of the [Basic Linear Algebra Subprograms (BLAS)](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) that have been the bedrock of scientific computing.

As a first step toward GEMM, we add a plain MatMul, i.e. the `α = 1, β = 0` special case (`C = AB`).
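
In NumPy terms, the relationship looks like this (a sketch for intuition; `gemm` and `matmul` are illustrative names, not code from this repo):

```python
import numpy as np

def gemm(alpha, A, B, beta, C):
    """General Matrix Multiply: C <- alpha * (A @ B) + beta * C.

    Scaling and accumulation happen in one pass, so a fused
    multiply-accumulate avoids writing A @ B out and reading it back.
    """
    return alpha * (A @ B) + beta * C

def matmul(A, B):
    """Pure matrix multiplication is GEMM with alpha = 1, beta = 0."""
    return gemm(1.0, A, B, 0.0, np.zeros((A.shape[0], B.shape[1])))
```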

## Systolic array implementation

The design follows the systolic array architecture described by H. T. Kung ("Why Systolic Architectures?", IEEE Computer, 1982):

- A network of PEs, each performing a multiply-accumulate every cycle and passing its operands on to a neighbor, so a single memory read is reused across many compute steps (sketched below).
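
For intuition, here is a minimal cycle-level model of a weight-stationary systolic matmul in Python. It is a sketch, not the RTL: the names are ours, and it assumes the common arrangement where weights are preloaded, activations stream west to east, and partial sums flow north to south.

```python
import numpy as np

def ws_systolic_matmul(A, W):
    """Cycle-level model of an N x N weight-stationary systolic array.

    PE (i, j) permanently holds W[i, j]. Column i of A streams through
    row i of the array, skewed by i cycles so partial sums stay aligned;
    partial sums flow down the columns, and C[m, j] = (A @ W)[m, j]
    emerges from the bottom of column j once the pipeline fills.
    """
    M, N = A.shape                 # W must be N x N
    a_reg = np.zeros((N, N))       # per-PE activation register (flows east)
    p_reg = np.zeros((N, N))       # per-PE partial-sum register (flows south)
    C = np.zeros((M, N))
    for t in range(M + 2 * N - 2):
        a_prev, p_prev = a_reg.copy(), p_reg.copy()  # last cycle's registers
        for i in range(N):
            for j in range(N):
                if j == 0:  # west edge: feed column i of A, delayed i cycles
                    a_in = A[t - i, i] if 0 <= t - i < M else 0.0
                else:
                    a_in = a_prev[i, j - 1]
                p_in = p_prev[i - 1, j] if i > 0 else 0.0
                a_reg[i, j] = a_in                    # pass activation east
                p_reg[i, j] = p_in + a_in * W[i, j]   # MAC, pass sum south
        for j in range(N):  # row m of C exits column j at cycle m + N - 1 + j
            m = t - (N - 1) - j
            if 0 <= m < M:
                C[m, j] = p_reg[N - 1, j]
    return C

A = np.arange(6, dtype=float).reshape(3, 2)
W = np.array([[1.0, 2.0], [3.0, 4.0]])
assert np.allclose(ws_systolic_matmul(A, W), A @ W)
```

The skew is the key trick: delaying row `i`'s input stream by `i` cycles makes the contribution of `A[m, i]` arrive at PE `(i, j)` exactly one cycle after the running sum over rows `0..i-1`, which is what lets the weights sit still while data flows.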

### PE (`pe.sv`)

@@ -61,6 +72,6 @@ This is why it's called "weight-stationary" — weights move once, data flows re

Given the current popularity of TPUs and LLMs, there are a number of "tiny TPU"-type projects:

- [tiny-tpu-v2/tiny-tpu](https://github.com/tiny-tpu-v2/tiny-tpu/tree/main)
- [Alanma23/tinytinyTPU](https://github.com/Alanma23/tinytinyTPU)
- [tiny-tpu-v2/tiny-tpu](https://github.com/tiny-tpu-v2/tiny-tpu/tree/main) - 2x2 matmul + ReLU to solve XOR problem
- [Alanma23/tinytinyTPU](https://github.com/Alanma23/tinytinyTPU) - 2x2 matmul + ReLU / ReLU6

48 changes: 48 additions & 0 deletions scripts/create_onnx.py
@@ -0,0 +1,48 @@
import onnx
from onnx import helper, TensorProto

# 1. Define Inputs and Outputs
# Input 'X' is a 1x2 vector
X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 2])
# Output 'Y' is a 1x1 matrix (a single value)
Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1])

# 2. Define Weights and Biases (Pre-trained values for XOR)
# Hidden layer (2 inputs -> 2 outputs)
w1_vals = [1.0, 1.0, 1.0, 1.0]
b1_vals = [0.0, -1.0]
# Output layer (2 inputs -> 1 output)
w2_vals = [1.0, -2.0]
b2_vals = [0.0]
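
# Hand-check of the XOR truth table with these weights:
# hidden pre-activation = (a + b, a + b - 1), output y = h1 - 2 * h2
#   (0, 0): h = ReLU(0, -1) = (0, 0) -> y = 0
#   (0, 1): h = ReLU(1,  0) = (1, 0) -> y = 1
#   (1, 0): h = ReLU(1,  0) = (1, 0) -> y = 1
#   (1, 1): h = ReLU(2,  1) = (2, 1) -> y = 2 - 2 = 0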

# Create Initializer (Weight) Tensors
W1 = helper.make_tensor('W1', TensorProto.FLOAT, [2, 2], w1_vals)
B1 = helper.make_tensor('B1', TensorProto.FLOAT, [2], b1_vals)
W2 = helper.make_tensor('W2', TensorProto.FLOAT, [2, 1], w2_vals)
B2 = helper.make_tensor('B2', TensorProto.FLOAT, [1], b2_vals)

# 3. Create the Computation Nodes
# Node 1: MatMul (X * W1)
node1 = helper.make_node('MatMul', ['X', 'W1'], ['dot1'])
# Node 2: Add Bias
node2 = helper.make_node('Add', ['dot1', 'B1'], ['plus1'])
# Node 3: ReLU
node3 = helper.make_node('Relu', ['plus1'], ['relu1'])
# Node 4: MatMul (relu1 * W2)
node4 = helper.make_node('MatMul', ['relu1', 'W2'], ['dot2'])
# Node 5: Add Bias (Final Output)
node5 = helper.make_node('Add', ['dot2', 'B2'], ['Y'])

# 4. Build the Graph and Model
graph = helper.make_graph(
[node1, node2, node3, node4, node5],
'XOR_Network',
[X], [Y],
[W1, B1, W2, B2]
)

model = helper.make_model(graph, producer_name='onnx-example')

# 5. Save the model
onnx.save(model, 'xor_model.onnx')
print("Model saved as xor_model.onnx")
49 changes: 49 additions & 0 deletions scripts/xor.py
@@ -0,0 +1,49 @@
import onnx
from onnx import helper, TensorProto

# 1. Define Inputs and Outputs
# Input 'X' is a 1x2 vector
X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 2])
# Output 'Y' is a 1x1 matrix (a single value)
Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1])

# 2. Define Weights and Biases (Pre-trained values for XOR)
# Hidden layer (2 inputs -> 2 outputs)
w1_vals = [1.0, 1.0, 1.0, 1.0]
b1_vals = [0.0, -1.0]
# Output layer (2 inputs -> 1 output)
w2_vals = [1.0, -2.0]
b2_vals = [0.0]

# Create Initializer (Weight) Tensors
W1 = helper.make_tensor('W1', TensorProto.FLOAT, [2, 2], w1_vals)
B1 = helper.make_tensor('B1', TensorProto.FLOAT, [2], b1_vals)
W2 = helper.make_tensor('W2', TensorProto.FLOAT, [2, 1], w2_vals)
B2 = helper.make_tensor('B2', TensorProto.FLOAT, [1], b2_vals)

# 3. Create the Computation Nodes
# Node 1: MatMul (X * W1)
node1 = helper.make_node('MatMul', ['X', 'W1'], ['dot1'])
# Node 2: Add Bias
node2 = helper.make_node('Add', ['dot1', 'B1'], ['plus1'])
# Node 3: ReLU
node3 = helper.make_node('Relu', ['plus1'], ['relu1'])
# Node 4: MatMul (relu1 * W2)
node4 = helper.make_node('MatMul', ['relu1', 'W2'], ['dot2'])
# Node 5: Add Bias (Final Output)
node5 = helper.make_node('Add', ['dot2', 'B2'], ['Y'])

# 4. Build the Graph and Model
graph = helper.make_graph(
[node1, node2, node3, node4, node5],
'XOR_Network',
[X], [Y],
[W1, B1, W2, B2]
)

model = helper.make_model(graph, producer_name='onnx-example')

# 5. Save the model
onnx.save(model, 'xor_model.onnx')
print("Model saved as xor_model.onnx")