InfiniTensor
diff --git a/‎.clang-format‎
Lines changed: 30 additions & 0 deletions b/‎.clang-format‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎.github/workflows/build.yaml‎
Lines changed: 60 additions & 0 deletions b/‎.github/workflows/build.yaml‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎include/llaisys.h‎
Lines changed: 66 additions & 0 deletions b/‎include/llaisys.h‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎include/llaisys/models/qwen2.h‎
Lines changed: 42 additions & 0 deletions b/‎include/llaisys/models/qwen2.h‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎include/llaisys/ops.h‎
Lines changed: 18 additions & 0 deletions b/‎include/llaisys/ops.h‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎include/llaisys/runtime.h‎
Lines changed: 47 additions & 0 deletions b/‎include/llaisys/runtime.h‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎include/llaisys/tensor.h‎
Lines changed: 68 additions & 0 deletions b/‎include/llaisys/tensor.h‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎python/llaisys/__init__.py‎
Lines changed: 20 additions & 0 deletions b/‎python/llaisys/__init__.py‎
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,30 @@
+---
+BasedOnStyle: LLVM
+IndentWidth: 4                        # Indent width; LLVM default is 2, changed to 4
+AccessModifierOffset: -4              # Offset of public/protected/private access specifiers relative to members, paired with IndentWidth; LLVM default is -2
+AlignOperands: AlignAfterOperator     # Alignment of binary-expression operands across lines; LLVM default is Align, changed so the operator wraps together with its operand
+BreakBeforeBinaryOperators: All       # Break before binary operators; LLVM default is None, changed so a wrapped line always starts with the binary operator, including assignment (=)
+ColumnLimit: 0                        # Column limit; LLVM default is 80, changed to unlimited
+AllowShortBlocksOnASingleLine: Always # Whether short blocks (single-statement blocks) may stay on one line; LLVM default is Never, changed to allow
+AllowShortLoopsOnASingleLine: true    # Whether short loops may stay on one line; LLVM default is false, changed to allow
+InsertBraces: true                    # Whether to insert braces after if/for/while/switch and similar statements; LLVM default is false, changed to enabled
+BreakBeforeBraces: Custom             # Brace-wrapping mode; LLVM default is LLVM, changed to Custom so that BraceWrapping takes effect
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterControlStatement: Never
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  BeforeLambdaBody: false
+  BeforeWhile: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
@@ -0,0 +1,60 @@
+# CI workflow: builds the library with xmake on Windows and Ubuntu, installs the Python package, then runs the assignment test scripts.
+name: Build and test
+# Runs on every pull request, and on pushes unless the push only touches Markdown files or LICENSE.
+on:
+  pull_request:
+  push:
+    paths-ignore:
+      - '**.md'
+      - 'LICENSE'
+
+jobs:
+  build:
+    name: Build
+    strategy:
+      # Keep the remaining matrix jobs running when one of them fails.
+      fail-fast: false
+      matrix:
+        os: [windows-latest, ubuntu-latest]
+        # NOTE(review): matrix.type is declared but not referenced by any step below.
+        type: [release]
+    runs-on: ${{ matrix.os }}
+    steps:
+
+    - name: checkout code
+      uses: actions/checkout@v4
+
+    # Installs the latest xmake release through the xmake setup action.
+    - name: install xmake
+      uses: xmake-io/github-action-setup-xmake@v1
+      with:
+        xmake-version: latest
+    
+    # Default xmake build followed by `xmake install`.
+    - name: Xmake Build & Install
+      run: | 
+        xmake
+        xmake install
+    
+    # Installs the Python package from the python/ directory with pip.
+    - name: Install Python
+      run: | 
+        cd python
+        pip install .
+        cd ..
+
+    # Runtime test, CPU device only.
+    - name: Assignment-0
+      run: |
+        python test/test_runtime.py --device cpu
+
+    # Tensor test.
+    - name: Assignment-1
+      run: |
+        python test/test_tensor.py
+    
+    # One test script per operator.
+    - name: Assignment-2
+      run: |
+        python test/ops/add.py 
+        python test/ops/argmax.py
+        python test/ops/embedding.py
+        python test/ops/linear.py 
+        python test/ops/rms_norm.py
+        python test/ops/rope.py
+        python test/ops/self_attention.py
+        python test/ops/swiglu.py
+
+    # Inference test script, run with --test.
+    - name: Assignment-3
+      run: |
+        python test/test_infer.py --test
@@ -0,0 +1,66 @@
+// Core LLAISYS header: export/linkage macros plus the shared enum and handle types.
+#ifndef __LLAISYS_H__
+#define __LLAISYS_H__
+
+// __export marks a symbol as visible outside the shared library:
+//   - Windows: __declspec(dllexport)
+//   - GCC-compatible compilers (>= 3.3): __attribute__((visibility("default")))
+//   - anything else: expands to nothing
+#if defined(_WIN32)
+#define __export __declspec(dllexport)
+#elif defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define __export __attribute__((visibility("default")))
+#else
+#define __export
+#endif
+
+// __C expands to `extern "C"` when compiled as C++ and to nothing when compiled as C.
+// The size and fixed-width integer headers matching the language are included alongside it.
+// NOTE(review): the other headers use it as `__C { ... }`; with the empty C expansion that
+// leaves a bare brace block at file scope — confirm whether plain-C consumers are supported.
+#ifdef __cplusplus
+#define __C extern "C"
+#include <cstddef>
+#include <cstdint>
+#else
+#define __C
+#include <stddef.h>
+#include <stdint.h>
+#endif
+
+// Device Types
+// LLAISYS_DEVICE_TYPE_COUNT is the last enumerator and has no explicit value, so it equals
+// the number of device types as long as the preceding values stay consecutive from 0.
+typedef enum {
+    LLAISYS_DEVICE_CPU = 0,
+    //// TODO: Add more device types here. Numbers need to be consecutive.
+    LLAISYS_DEVICE_NVIDIA = 1,
+    LLAISYS_DEVICE_TYPE_COUNT
+} llaisysDeviceType_t;
+
+// Data Types
+// Element-type tags. Every value is assigned explicitly and 0 is reserved for INVALID.
+// NOTE(review): these numbers are presumably mirrored by the Python `DataType` binding —
+// confirm before renumbering or inserting entries.
+typedef enum {
+    LLAISYS_DTYPE_INVALID = 0,
+    LLAISYS_DTYPE_BYTE = 1,
+    LLAISYS_DTYPE_BOOL = 2,
+    // Presumably signed integers of 8/16/32/64 bits — confirm.
+    LLAISYS_DTYPE_I8 = 3,
+    LLAISYS_DTYPE_I16 = 4,
+    LLAISYS_DTYPE_I32 = 5,
+    LLAISYS_DTYPE_I64 = 6,
+    // Presumably unsigned integers of 8/16/32/64 bits — confirm.
+    LLAISYS_DTYPE_U8 = 7,
+    LLAISYS_DTYPE_U16 = 8,
+    LLAISYS_DTYPE_U32 = 9,
+    LLAISYS_DTYPE_U64 = 10,
+    // Presumably floating-point types of 8/16/32/64 bits — confirm.
+    LLAISYS_DTYPE_F8 = 11,
+    LLAISYS_DTYPE_F16 = 12,
+    LLAISYS_DTYPE_F32 = 13,
+    LLAISYS_DTYPE_F64 = 14,
+    // Presumably complex types; whether the number is the total or per-component width is not visible here — confirm.
+    LLAISYS_DTYPE_C16 = 15,
+    LLAISYS_DTYPE_C32 = 16,
+    LLAISYS_DTYPE_C64 = 17,
+    LLAISYS_DTYPE_C128 = 18,
+    // Presumably bfloat16; note it is numbered after the complex types, not next to F16.
+    LLAISYS_DTYPE_BF16 = 19,
+} llaisysDataType_t;
+
+// Runtime Types
+// Stream
+// Opaque stream handle, represented as an untyped pointer.
+typedef void *llaisysStream_t;
+
+// Memory Copy Directions
+// NOTE(review): H and D presumably stand for host and device, read as source-to-destination
+// (e.g. H2D = host to device) — confirm against the runtime implementations.
+typedef enum {
+    LLAISYS_MEMCPY_H2H = 0,
+    LLAISYS_MEMCPY_H2D = 1,
+    LLAISYS_MEMCPY_D2H = 2,
+    LLAISYS_MEMCPY_D2D = 3,
+} llaisysMemcpyKind_t;
+
+#endif // __LLAISYS_H__
@@ -0,0 +1,42 @@
+#ifndef LLAISYS_MODELS_QWEN2_H
+#define LLAISYS_MODELS_QWEN2_H
+
+#include "../tensor.h"
+
+__C {
+    // Model hyper-parameters passed to llaisysQwen2ModelCreate.
+    // NOTE(review): the abbreviations presumably mean nlayer = number of layers, hs = hidden size,
+    // nh = attention heads, nkvh = key/value heads, dh = per-head dim, di = MLP intermediate dim,
+    // maxseq = maximum sequence length, voc = vocabulary size — confirm against the loader.
+    struct LlaisysQwen2Meta {
+        llaisysDataType_t dtype;
+        size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc;
+        float epsilon, theta;
+        int64_t end_token;
+    };
+
+    // Tensor handles for the model weights.
+    // Members declared as `llaisysTensor_t *` are pointers to handles —
+    // presumably arrays with one entry per layer; confirm the length against nlayer.
+    struct LlaisysQwen2Weights {
+        llaisysTensor_t in_embed;
+        llaisysTensor_t out_embed;
+        llaisysTensor_t out_norm_w;   // a.k.a. model.norm.weight
+        llaisysTensor_t *attn_norm_w; // a.k.a. input_layernorm.weight
+        llaisysTensor_t *attn_q_w;
+        llaisysTensor_t *attn_q_b;
+        llaisysTensor_t *attn_k_w;
+        llaisysTensor_t *attn_k_b;
+        llaisysTensor_t *attn_v_w;
+        llaisysTensor_t *attn_v_b;
+        llaisysTensor_t *attn_o_w;
+        llaisysTensor_t *mlp_norm_w; // a.k.a. post_attention_layernorm.weight
+        llaisysTensor_t *mlp_gate_w;
+        llaisysTensor_t *mlp_up_w;
+        llaisysTensor_t *mlp_down_w;
+    };
+
+    // Opaque model type; only pointers to it cross this interface.
+    struct LlaisysQwen2Model;
+
+    // Creates a model from `meta` for the given device type and list of `ndevice` device ids.
+    // `meta` is spelled with the `struct` keyword, like every other declaration in this block,
+    // so the prototype names a valid type in C as well as C++.
+    // NOTE(review): who owns the returned pointer is not visible here — presumably released with llaisysQwen2ModelDestroy.
+    __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const struct LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);
+
+    // Destroys a model previously obtained from llaisysQwen2ModelCreate (presumably; confirm).
+    __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);
+
+    // Returns a pointer to the weight table of `model`.
+    // NOTE(review): presumably owned by the model and filled in by the caller — confirm.
+    __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
+
+    // Runs inference over `ntoken` token ids and returns an int64_t.
+    // NOTE(review): the return value is presumably the next token id — confirm.
+    __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
+}
+#endif // LLAISYS_MODELS_QWEN2_H
@@ -0,0 +1,18 @@
+#ifndef LLAISYS_OPS_H
+#define LLAISYS_OPS_H
+
+#include "tensor.h"
+
+__C {
+    // Operator entry points exported by the library.
+    // Every operator takes opaque tensor handles and returns void, so no status is reported through the return value.
+    // NOTE(review): parameter names suggest the output tensor(s) come first, followed by the inputs — confirm against the implementations.
+    // Presumably c = a + b elementwise — confirm.
+    __export void llaisysAdd(llaisysTensor_t c, llaisysTensor_t a, llaisysTensor_t b);
+    // Presumably writes the index and value of the maximum of `vals` into `max_idx` / `max_val` — confirm.
+    __export void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals);
+    // Presumably gathers rows of `weight` selected by `index` into `out` — confirm.
+    __export void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight);
+    // NOTE(review): whether `bias` may be a null handle is not visible here.
+    __export void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias);
+    __export void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in);
+    // `eps` is passed by value as a float.
+    __export void llaisysRmsNorm(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, float eps);
+    // `pos_ids` is a tensor handle; `theta` is passed by value as a float.
+    __export void llaisysROPE(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t pos_ids, float theta);
+    // `scale` is supplied by the caller rather than derived inside the call signature.
+    __export void llaisysSelfAttention(llaisysTensor_t attn_val, llaisysTensor_t q, llaisysTensor_t k, llaisysTensor_t v, float scale);
+    __export void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up);
+}
+
+#endif
@@ -0,0 +1,47 @@
+#ifndef LLAISYS_RUNTIME_H
+#define LLAISYS_RUNTIME_H
+
+#include "../llaisys.h"
+
+__C {
+    // Runtime API function-pointer types.
+    // Parameterless entries are spelled `(void)` so they are full prototypes in C as well as in C++
+    // (an empty `()` in C before C23 leaves the parameters unspecified).
+    // Device
+    typedef int (*get_device_count_api)(void);
+    typedef void (*set_device_api)(int);
+    typedef void (*device_synchronize_api)(void);
+    // Stream
+    typedef llaisysStream_t (*create_stream_api)(void);
+    typedef void (*destroy_stream_api)(llaisysStream_t);
+    typedef void (*stream_synchronize_api)(llaisysStream_t);
+    // Memory
+    typedef void *(*malloc_device_api)(size_t);
+    typedef void (*free_device_api)(void *);
+    typedef void *(*malloc_host_api)(size_t);
+    typedef void (*free_host_api)(void *);
+    // Memory copy: (destination, source, size, kind[, stream]) —
+    // NOTE(review): argument roles are inferred from the const qualifier on the second pointer; confirm.
+    typedef void (*memcpy_sync_api)(void *, const void *, size_t, llaisysMemcpyKind_t);
+    typedef void (*memcpy_async_api)(void *, const void *, size_t, llaisysMemcpyKind_t, llaisysStream_t);
+
+    // Table of runtime entry points, one function pointer per operation declared above.
+    struct LlaisysRuntimeAPI {
+        get_device_count_api get_device_count;
+        set_device_api set_device;
+        device_synchronize_api device_synchronize;
+        create_stream_api create_stream;
+        destroy_stream_api destroy_stream;
+        stream_synchronize_api stream_synchronize;
+        malloc_device_api malloc_device;
+        free_device_api free_device;
+        malloc_host_api malloc_host;
+        free_host_api free_host;
+        memcpy_sync_api memcpy_sync;
+        memcpy_async_api memcpy_async;
+    };
+
+    // Llaisys API for getting the runtime APIs
+    // The return type uses the `struct` keyword so the declaration names a valid type in C too.
+    // NOTE(review): lifetime of the returned table is not visible here — presumably static; confirm.
+    __export const struct LlaisysRuntimeAPI *llaisysGetRuntimeAPI(llaisysDeviceType_t);
+
+    // Llaisys API for switching device context
+    // NOTE(review): the unnamed int is presumably the device id — confirm.
+    __export void llaisysSetContextRuntime(llaisysDeviceType_t, int);
+}
+
+#endif // LLAISYS_RUNTIME_H
@@ -0,0 +1,68 @@
+#ifndef LLAISYS_TENSOR_H
+#define LLAISYS_TENSOR_H
+
+#include "../llaisys.h"
+
+__C {
+    // Opaque tensor handle: struct LlaisysTensor is only declared here, never defined.
+    typedef struct LlaisysTensor *llaisysTensor_t;
+
+    // Creates a tensor for the given shape array, rank, element type and device, and returns its handle.
+    // NOTE(review): `shape` presumably holds `ndim` entries — confirm.
+    __export llaisysTensor_t tensorCreate(
+        size_t * shape,
+        size_t ndim,
+        llaisysDataType_t dtype,
+        llaisysDeviceType_t device_type,
+        int device_id);
+
+    // Destroys a tensor handle.
+    // NOTE(review): whether handles returned by tensorView/tensorPermute/tensorSlice must also be destroyed is not visible here.
+    __export void tensorDestroy(
+        llaisysTensor_t tensor);
+
+    // Returns an untyped pointer to the tensor's data.
+    // NOTE(review): presumably a device pointer for non-CPU tensors — confirm before dereferencing on the host.
+    __export void *tensorGetData(
+        llaisysTensor_t tensor);
+
+    // Returns the number of dimensions.
+    __export size_t tensorGetNdim(
+        llaisysTensor_t tensor);
+
+    // Reports the shape through the caller-supplied `shape` pointer (the function itself returns void).
+    // NOTE(review): the buffer presumably needs room for tensorGetNdim() entries — confirm.
+    __export void tensorGetShape(
+        llaisysTensor_t tensor,
+        size_t * shape);
+
+    // Reports the strides through the caller-supplied `strides` pointer; strides are signed (ptrdiff_t).
+    // NOTE(review): unit (elements vs. bytes) is not visible here — confirm.
+    __export void tensorGetStrides(
+        llaisysTensor_t tensor,
+        ptrdiff_t * strides);
+
+    // Returns the element type tag.
+    __export llaisysDataType_t tensorGetDataType(
+        llaisysTensor_t tensor);
+
+    // Returns the device type the tensor is associated with.
+    __export llaisysDeviceType_t tensorGetDeviceType(
+        llaisysTensor_t tensor);
+
+    // Returns the device id the tensor is associated with.
+    __export int tensorGetDeviceId(
+        llaisysTensor_t tensor);
+
+    // Debug helper; NOTE(review): presumably prints tensor metadata/contents — confirm.
+    __export void tensorDebug(
+        llaisysTensor_t tensor);
+
+    // Contiguity query returned as uint8_t; NOTE(review): presumably 1 for contiguous and 0 otherwise — confirm.
+    __export uint8_t tensorIsContiguous(
+        llaisysTensor_t tensor);
+
+    // Loads `data` into the tensor; the source is read-only (const void *).
+    // NOTE(review): presumably a host pointer holding the full tensor contents — confirm.
+    __export void tensorLoad(
+        llaisysTensor_t tensor,
+        const void *data);
+
+    // Returns a handle for `tensor` seen with a new shape of rank `ndim`.
+    // NOTE(review): presumably shares storage with the source tensor — confirm.
+    __export llaisysTensor_t tensorView(
+        llaisysTensor_t tensor,
+        size_t * shape,
+        size_t ndim);
+
+    // Returns a handle for `tensor` with its dimensions reordered according to `order`.
+    // No length is passed; NOTE(review): `order` presumably holds tensorGetNdim() entries — confirm.
+    __export llaisysTensor_t tensorPermute(
+        llaisysTensor_t tensor,
+        size_t * order);
+
+    // Returns a handle for a slice of `tensor` along dimension `dim`.
+    // NOTE(review): the range is presumably half-open, [start, end) — confirm.
+    __export llaisysTensor_t tensorSlice(
+        llaisysTensor_t tensor,
+        size_t dim,
+        size_t start,
+        size_t end);
+}
+
+#endif // LLAISYS_TENSOR_H
@@ -0,0 +1,20 @@
+from .runtime import RuntimeAPI
+from .libllaisys import DeviceType
+from .libllaisys import DataType
+from .libllaisys import MemcpyKind
+from .libllaisys import llaisysStream_t as Stream
+from .tensor import Tensor
+from .ops import Ops
+from . import models
+from .models import *
+
+# Names exported by `from llaisys import *`.
+# Only the `models` subpackage itself is listed; any names brought in by the
+# `from .models import *` line above are not repeated in this list.
+__all__ = [
+    "RuntimeAPI",
+    "DeviceType",
+    "DataType",
+    "MemcpyKind",
+    "Stream",
+    "Tensor",
+    "Ops",
+    "models",
+]