From ebab64ab775b340729ad3cb62ad662ab98f07e0d Mon Sep 17 00:00:00 2001 From: Avik De Date: Tue, 10 Feb 2026 15:01:33 -0500 Subject: [PATCH 1/4] Initial file --- scripts/xor.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 scripts/xor.py diff --git a/scripts/xor.py b/scripts/xor.py new file mode 100644 index 0000000..ecd9437 --- /dev/null +++ b/scripts/xor.py @@ -0,0 +1,49 @@ +import onnx +from onnx import helper, TensorProto + +# 1. Define Inputs and Outputs +# Input 'X' is a 1x2 vector +X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 2]) +# Output 'Y' is a 1x1 scalar +Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1]) + +# 2. Define Weights and Biases (Pre-trained values for XOR) +# Hidden layer (2 inputs -> 2 outputs) +w1_vals = [1.0, 1.0, 1.0, 1.0] +b1_vals = [0.0, -1.0] +# Output layer (2 inputs -> 1 output) +w2_vals = [1.0, -2.0] +b2_vals = [0.0] + +# Create Initializer (Weight) Tensors +W1 = helper.make_tensor('W1', TensorProto.FLOAT, [2, 2], w1_vals) +B1 = helper.make_tensor('B1', TensorProto.FLOAT, [2], b1_vals) +W2 = helper.make_tensor('W2', TensorProto.FLOAT, [2, 1], w2_vals) +B2 = helper.make_tensor('B2', TensorProto.FLOAT, [1], b2_vals) + +# 3. Create the Computation Nodes +# Node 1: MatMul (X * W1) +node1 = helper.make_node('MatMul', ['X', 'W1'], ['dot1']) +# Node 2: Add Bias +node2 = helper.make_node('Add', ['dot1', 'B1'], ['plus1']) +# Node 3: ReLU +node3 = helper.make_node('Relu', ['plus1'], ['relu1']) +# Node 4: MatMul (relu1 * W2) +node4 = helper.make_node('MatMul', ['relu1', 'W2'], ['dot2']) +# Node 5: Add Bias (Final Output) +node5 = helper.make_node('Add', ['dot2', 'B2'], ['Y']) + +# 4. Build the Graph and Model +graph = helper.make_graph( + [node1, node2, node3, node4, node5], + 'XOR_Network', + [X], [Y], + [W1, B1, W2, B2] +) + +model = helper.make_model(graph, producer_name='onnx-example') + +# 5. Save the model +onnx.save(model, 'xor_model.onnx') +print("Model saved as xor_model.onnx") + From 717fdee5e03faeea82e53475925d2bcf7283adae Mon Sep 17 00:00:00 2001 From: Avik De Date: Tue, 10 Feb 2026 17:28:41 -0500 Subject: [PATCH 2/4] WIP --- README.md | 17 ++++++++++++++--- scripts/create_onnx.py | 0 2 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 scripts/create_onnx.py diff --git a/README.md b/README.md index de05095..e53afa8 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,18 @@ cd build && ctest --verbose Tests produce waveform files (`*.fst`) in `test/sim_build/`. Open them in VSCode with the Surfer extension to inspect signals. -## Architecture +## Supported operations + +Most previous projects have focused on a matrix multiply followed by activation, mimicking Google's TPU, and aiming to solve something like the XOR classification problem. + +For this project, our goal was to develop something broadly applicable. We initially focused on General Matrix Multiply (GEMM); because it parameterizes the scaling constants `α` and `β` (`C = αAB + βC`), it subsumes pure matrix multiplication as a special case and enables fused multiply-accumulate patterns that avoid redundant memory writes. Many types of computations common in deep learning, including matrix multiplication, are [specializations of a GEMM operation](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html). 
GEMMs show up in [dense fully connected networks](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html) that are a core component of transformers and RNNs.
+
+Lastly, it is one of the [Basic Linear Algebra Subprograms (BLAS)](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) that have been the bedrock of scientific computing.
+
+## Systolic array implementation
+
+Based on the systolic array architecture described in H. T. Kung's paper, "Why Systolic Architectures?" (IEEE Computer, 1982):
+- A network of processing elements (PEs)
 
 ### PE (`pe.sv`)
 
@@ -61,6 +72,6 @@ This is why it's called "weight-stationary" — weights move once, data flows re
 
 There are a number of "tiny TPU"-type projects, due to the current popularity of TPUs and LLMs.
 
-- [tiny-tpu-v2/tiny-tpu](https://github.com/tiny-tpu-v2/tiny-tpu/tree/main)
-- [Alanma23/tinytinyTPU](https://github.com/Alanma23/tinytinyTPU)
+- [tiny-tpu-v2/tiny-tpu](https://github.com/tiny-tpu-v2/tiny-tpu/tree/main) - 2x2 matmul + ReLU to solve XOR problem
+- [Alanma23/tinytinyTPU](https://github.com/Alanma23/tinytinyTPU) - 2x2 matmul + ReLU / ReLU6
 
diff --git a/scripts/create_onnx.py b/scripts/create_onnx.py
new file mode 100644
index 0000000..e69de29

From 6974e91b770e13754f0134570b2b0400af6e0fa4 Mon Sep 17 00:00:00 2001
From: Avik De
Date: Tue, 10 Feb 2026 17:29:46 -0500
Subject: [PATCH 3/4] xor model with matmul and add

---
 scripts/create_onnx.py | 48 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/scripts/create_onnx.py b/scripts/create_onnx.py
index e69de29..7108af2 100644
--- a/scripts/create_onnx.py
+++ b/scripts/create_onnx.py
@@ -0,0 +1,48 @@
+import onnx
+from onnx import helper, TensorProto
+
+# 1. Define Inputs and Outputs
+# Input 'X' is a 1x2 vector
+X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 2])
+# Output 'Y' is a 1x1 scalar
+Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1])
+
+# 2. Define Weights and Biases (Pre-trained values for XOR)
+# Hidden layer (2 inputs -> 2 outputs)
+w1_vals = [1.0, 1.0, 1.0, 1.0]
+b1_vals = [0.0, -1.0]
+# Output layer (2 inputs -> 1 output)
+w2_vals = [1.0, -2.0]
+b2_vals = [0.0]
+
+# Create Initializer (Weight) Tensors
+W1 = helper.make_tensor('W1', TensorProto.FLOAT, [2, 2], w1_vals)
+B1 = helper.make_tensor('B1', TensorProto.FLOAT, [2], b1_vals)
+W2 = helper.make_tensor('W2', TensorProto.FLOAT, [2, 1], w2_vals)
+B2 = helper.make_tensor('B2', TensorProto.FLOAT, [1], b2_vals)
+
+# 3. Create the Computation Nodes
+# Node 1: MatMul (X * W1)
+node1 = helper.make_node('MatMul', ['X', 'W1'], ['dot1'])
+# Node 2: Add Bias
+node2 = helper.make_node('Add', ['dot1', 'B1'], ['plus1'])
+# Node 3: ReLU
+node3 = helper.make_node('Relu', ['plus1'], ['relu1'])
+# Node 4: MatMul (relu1 * W2)
+node4 = helper.make_node('MatMul', ['relu1', 'W2'], ['dot2'])
+# Node 5: Add Bias (Final Output)
+node5 = helper.make_node('Add', ['dot2', 'B2'], ['Y'])
+
+# 4. Build the Graph and Model
+graph = helper.make_graph(
+    [node1, node2, node3, node4, node5],
+    'XOR_Network',
+    [X], [Y],
+    [W1, B1, W2, B2]
+)
+
+model = helper.make_model(graph, producer_name='onnx-example')
+
+# 5. Save the model
+onnx.save(model, 'xor_model.onnx')
+print("Model saved as xor_model.onnx")
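For reference, the hand-picked weights above encode XOR as `Y = ReLU(x1 + x2) - 2 * ReLU(x1 + x2 - 1)`, which evaluates to 0, 1, 1, 0 for the four input combinations. A quick sanity check of the saved graph (not part of this patch series; assumes `numpy` and `onnxruntime` are installed) could look something like this:

```python
# Sanity check for xor_model.onnx (illustrative only; not part of these patches).
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("xor_model.onnx")
for x1 in (0.0, 1.0):
    for x2 in (0.0, 1.0):
        x = np.array([[x1, x2]], dtype=np.float32)
        (y,) = sess.run(["Y"], {"X": x})  # run the graph on one input pair
        print(f"XOR({int(x1)}, {int(x2)}) = {y[0, 0]:.1f}")
# Expected output: 0.0, 1.0, 1.0, 0.0
```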
From f8cc256feade38e2313885e150a29fc11e22429b Mon Sep 17 00:00:00 2001
From: Avik De
Date: Tue, 10 Feb 2026 17:33:23 -0500
Subject: [PATCH 4/4] start with matmul

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e53afa8..aa76528 100644
--- a/README.md
+++ b/README.md
@@ -35,11 +35,11 @@ Tests produce waveform files (`*.fst`) in `test/sim_build/`. Open them in VSCode
 
 ## Supported operations
 
-Most previous projects have focused on a matrix multiply followed by activation, mimicking Google's TPU, and aiming to solve something like the XOR classification problem.
+For this project, our goal was to develop something broadly applicable.
 
-For this project, our goal was to develop something broadly applicable. We initially focused on General Matrix Multiply (GEMM); because it parameterizes the scaling constants `α` and `β` (`C = αAB + βC`), it subsumes pure matrix multiplication as a special case and enables fused multiply-accumulate patterns that avoid redundant memory writes. Many types of computations common in deep learning, including matrix multiplication, are [specializations of a GEMM operation](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html). GEMMs show up in [dense fully connected networks](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html) that are a core component of transformers and RNNs.
+We initially focused on General Matrix Multiply (GEMM); because it parameterizes the scaling constants `α` and `β` (`C = αAB + βC`), it subsumes pure matrix multiplication as a special case and enables fused multiply-accumulate patterns that avoid redundant memory writes. Many types of computations common in deep learning, including matrix multiplication, are [specializations of a GEMM operation](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html). GEMMs show up in [dense fully connected networks](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html) that are a core component of transformers and RNNs. Lastly, it is one of the [Basic Linear Algebra Subprograms (BLAS)](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) that have been the bedrock of scientific computing.
 
-Lastly, it is one of the [Basic Linear Algebra Subprograms (BLAS)](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) that have been the bedrock of scientific computing.
+As a first step toward full GEMM support, we start with a plain MatMul (`C = AB`).
 
 ## Systolic array implementation
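Looking ahead, the `C = αAB + βC` form called out in the README maps directly onto ONNX's fused `Gemm` operator, whose `alpha` and `beta` attributes play the role of α and β. Once the hardware supports full GEMM, each MatMul + Add pair in `scripts/create_onnx.py` could collapse into a single node, with the Relu node and the rest of the graph unchanged. The sketch below is illustrative only (not part of these patches) and reuses the tensor names from that script:

```python
# Sketch only: fold each MatMul + Add pair into a single ONNX Gemm node,
# where Gemm computes alpha * (A @ B) + beta * C. Not part of this patch series.
from onnx import helper

# Hidden layer: plus1 = 1.0 * (X @ W1) + 1.0 * B1  (replaces node1 and node2)
gemm1 = helper.make_node('Gemm', ['X', 'W1', 'B1'], ['plus1'], alpha=1.0, beta=1.0)

# Output layer: Y = 1.0 * (relu1 @ W2) + 1.0 * B2  (replaces node4 and node5)
gemm2 = helper.make_node('Gemm', ['relu1', 'W2', 'B2'], ['Y'], alpha=1.0, beta=1.0)
```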