Add MLIR support (#1044)

lutzroeder · lutzroeder · commit e40469dbfdc3 · 2025-12-04T09:20:50.000-08:00
diff --git a/source/mlir-metadata.json b/source/mlir-metadata.json
@@ -23436,7 +23436,7 @@
   },
   {
     "name": "nvvm.fence.mbarrier.init",
-    "description": "Fence operation that applies on the prior nvvm.mbarrier.init\n    \n    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)",
+    "description": "Fence operation that applies on the prior nvvm.mbarrier.init\n\n    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)",
     "assemblyFormat": "attr-dict"
   },
   {
@@ -23474,10 +23474,30 @@
     ],
     "assemblyFormat": "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict"
   },
+  {
+    "name": "nvvm.fence.proxy.sync_restrict",
+    "summary": "Uni-directional proxy fence operation with sync_restrict",
+    "description": "The `nvvm.fence.proxy.sync_restrict` Op is used to establish\n    ordering between a prior memory access performed between proxies. Currently,\n    the ordering is only supported between async and generic proxies. `sync_restrict`\n    restricts `acquire` memory semantics to `shared_cluster` and `release` memory\n    semantics to `shared_cta` with cluster scope.\n    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)",
+    "attributes": [
+      { "name": "order", "type": "MemOrderKindAttr{weak|relaxed|acquire|release|acq_rel|sc|mmio|volatile}" },
+      { "name": "fromProxy", "type": "DefaultValuedAttr<ProxyKindAttr{alias|async|async.global|async.shared|tensormap|generic}, ProxyKind::GENERIC>" },
+      { "name": "toProxy", "type": "DefaultValuedAttr<ProxyKindAttr{alias|async|async.global|async.shared|tensormap|generic}, ProxyKind::async>" }
+    ],
+    "assemblyFormat": "attr-dict"
+  },
   {
     "name": "nvvm.fence.sc.cluster",
     "assemblyFormat": "attr-dict"
   },
+  {
+    "name": "nvvm.fence.sync_restrict",
+    "summary": "Uni-directional thread fence operation",
+    "description": "The `nvvm.fence.sync_restrict` Op restricts the class of memory\n    operations for which the fence instruction provides the memory ordering guarantees.\n    `sync_restrict` restricts `acquire` memory semantics to `shared_cluster` and\n    `release` memory semantics to `shared_cta` with cluster scope.\n    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)",
+    "attributes": [
+      { "name": "order", "type": "MemOrderKindAttr{weak|relaxed|acquire|release|acq_rel|sc|mmio|volatile}" }
+    ],
+    "assemblyFormat": "attr-dict"
+  },
   {
     "name": "nvvm.griddepcontrol",
     "description": "If the $kind attribute is set to `wait`, it causes the \n    executing thread to wait until all prerequisite grids in flight \n    have completed and all the memory operations from the prerequisite grids \n    are performed and made visible to the current grid.\n\n    When the $kind is launch_dependents, it signals that specific dependents \n    the runtime system designated to react to this instruction can be scheduled \n    as soon as all other CTAs in the grid issue the same instruction or have \n    completed.\n\n    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-griddepcontrol)",
@@ -23705,6 +23725,24 @@
     ],
     "assemblyFormat": "$addr `,` $stateOrPhase attr-dict `:` type(operands) `->` type($res)"
   },
+  {
+    "name": "nvvm.mbarrier.try_wait",
+    "summary": "MBarrier try wait on state or phase with an optional time limit",
+    "description": "The `nvvm.mbarrier.try_wait` operation checks whether the specified\n    *mbarrier object* at `addr` has completed the given phase. Note that\n    unlike the `nvvm.mbarrier.test.wait` operation, the try_wait operation\n    is a potentially-blocking one. If the phase is not yet complete, the\n    calling thread may be suspended. A suspended thread resumes execution\n    once the phase completes or when a system-defined timeout occurs.\n    Optionally, the `ticks` operand can be used to provide a custom timeout\n    (in nanoseconds), overriding the system-defined one. The semantics of\n    this operation and its operands are otherwise similar to those of the\n    `nvvm.mbarrier.test.wait` Op.\n\n    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait)",
+    "inputs": [
+      { "name": "addr", "type": "AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>" },
+      { "name": "stateOrPhase", "type": "AnyTypeOf<[I64, I32]>" },
+      { "name": "ticks", "type": "Optional<I32>" }
+    ],
+    "outputs": [
+      { "name": "res", "type": "I1" }
+    ],
+    "attributes": [
+      { "name": "scope", "type": "DefaultValuedAttr<MemScopeKindAttr{cta|cluster|gpu|sys}, MemScopeKind::CTA>" },
+      { "name": "relaxed", "type": "DefaultValuedAttr<BoolAttr, false>" }
+    ],
+    "assemblyFormat": "$addr `,` $stateOrPhase (`,` $ticks^)? attr-dict `:` type(operands) `->` type($res)"
+  },
   {
     "name": "nvvm.mbarrier.try_wait.parity",
     "summary": "MBarrier Potentially-Blocking Try Wait with Phase Parity",
@@ -43976,6 +44014,27 @@
     "assemblyFormat": "`(` $args `)` `waits` `(` $wait_fence_likes `)` `signals` `(` $signal_fence_like `)`\n    attr-dict `:` functional-type($args, $results)",
     "hasCustomAssemblyFormat": true
   },
+  {
+    "name": "stream.test.timeline_op",
+    "summary": "Test op implementing TimelineOpInterface.",
+    "description": "Test-only operation that implements TimelineOpInterface directly,\n    allowing Stream dialect tests to verify timeline behavior without\n    requiring execution regions or specific op semantics.\n\n    This op takes zero or more resource operands with their sizes, zero or more\n    await timepoints, and produces zero or more resource results with a result\n    timepoint. It has no side effects and no execution region - it purely exists\n    to test timepoint propagation, coverage analysis, and timeline scheduling.\n\n    The `await_limit` attribute (default -1) controls how many await timepoints\n    this op can accept before forcing join creation:\n    - await_limit = -1: Unlimited awaits (default, no joins created)\n    - await_limit = 0: No awaits allowed (all timepoints must be joined)\n    - await_limit = 1: Max 1 await (mimics stream.async.execute behavior)\n    - await_limit = N: Max N awaits\n\n    Use this op when testing passes that operate on TimelineOpInterface methods.\n    It's simpler than stream.async.execute (no execution region) but can still\n    model resource-timepoint flows. For example:\n    - ElideTimepointsPass coverage analysis (elide_timepoints_coverage.mlir)\n    - PropagateTimepointsPass scheduling (propagate_timepoints.mlir)\n    - SCF region handling with resources (elide_timepoints_scf.mlir)\n\n    Examples:\n      // Simple timeline op with no dependencies or resources.\n      %tp0 = stream.test.timeline_op with() : () -> () => !stream.timepoint\n\n      // Timeline op awaiting other timepoints.\n      %tp1 = stream.test.timeline_op await(%tp0) => with() : () -> () => !stream.timepoint\n\n      // Timeline op with resource operands and results.\n      %r, %tp = stream.test.timeline_op\n        with(%arg) : (!stream.resource<external>{%size}) -> !stream.resource<external>{%size}\n        => !stream.timepoint\n\n      // Timeline op with await_limit to test join creation.\n      %r, %tp = stream.test.timeline_op await_limit(1) await(%tp0) =>\n        with(%arg) : (!stream.resource<external>{%size}) -> !stream.resource<external>{%size}\n        => !stream.timepoint",
+    "inputs": [
+      { "name": "resource_operands", "type": "Variadic<AnyTypeOf<[ Stream_AnyStreamResource, Stream_StagingResource, ]>>" },
+      { "name": "resource_operand_sizes", "type": "Variadic<Stream_Size>" },
+      { "name": "result_sizes", "type": "Variadic<Stream_Size>" },
+      { "name": "await_operands", "type": "Variadic<Stream_Timepoint>" }
+    ],
+    "outputs": [
+      { "name": "results", "type": "Variadic<AnyTypeOf<[ Stream_AnyStreamResource, Stream_StagingResource, ]>>" },
+      { "name": "result_timepoint", "type": "Stream_Timepoint" }
+    ],
+    "attributes": [
+      { "name": "tied_operands", "type": "OptionalAttr<Util_TiedOpStorageAttr>" },
+      { "name": "await_limit", "type": "OptionalAttr<I64Attr>" }
+    ],
+    "assemblyFormat": "(`await_limit` `(` $await_limit^ `)`)?\n    (`await` `(` $await_operands^ `)` `=` `` `>`)?\n    `with` `(` $resource_operands `)` attr-dict `:`\n    custom<ShapedFunctionType>(ref($resource_operands),\n                               type($resource_operands), $resource_operand_sizes,\n                               type($results), $result_sizes,\n                               $tied_operands)\n    `=` `` `>` type($result_timepoint)",
+    "hasCustomAssemblyFormat": true
+  },
   {
     "name": "stream.timepoint.await",
     "summary": "Awaits a timepoint before returning a set of resources.",
diff --git a/source/mlir.js b/source/mlir.js
@@ -3127,6 +3127,33 @@ mlir.FunctionType = class extends mlir.Type {
     }
 };
 
+mlir.LLVMFunctionType = class extends mlir.Type {
+
+    constructor(returnType, params, varArg = false) {
+        super(null);
+        this.returnType = returnType;
+        this.params = params || [];
+        this.varArg = varArg;
+    }
+
+    get inputs() {
+        return this.params;
+    }
+
+    get results() {
+        return this.returnType ? [this.returnType] : [];
+    }
+
+    toString() {
+        const params = this.params.map((t) => t.toString());
+        if (this.varArg) {
+            params.push('...');
+        }
+        const returnType = this.returnType ? this.returnType.toString() : 'void';
+        return `!llvm.func<${returnType} (${params.join(', ')})>`;
+    }
+};
+
 mlir.Utility = class {
 
     static dataType(value) {
@@ -11794,17 +11821,15 @@ mlir.LLVMDialect = class extends mlir.Dialect {
             op.attributes.push({ name: 'CConv', value: parser.expect('id') });
         }
         parser.parseSymbolName('sym_name', op.attributes);
-        const type = {};
         const argResult = parser.parseFunctionArgumentList(true);
-        type.inputs = argResult.arguments.map((a) => a.type);
-        if (argResult.isVariadic) {
-            op.attributes.push({ name: 'var_arg_', value: true });
-        }
-        type.results = [];
+        const params = argResult.arguments.map((a) => a.type);
+        const results = [];
         const resultAttrs = [];
         if (parser.accept('->')) {
-            parser.parseFunctionResultList(type.results, resultAttrs);
+            parser.parseFunctionResultList(results, resultAttrs);
         }
+        const returnType = results.length > 0 ? results[0] : null;
+        const type = new mlir.LLVMFunctionType(returnType, params, argResult.isVariadic);
         op.attributes.push({ name: 'function_type', value: type });
         if (parser.accept('id', 'vscale_range')) {
             parser.expect('(');
diff --git a/tools/mlir b/tools/mlir
@@ -59,7 +59,7 @@ schema() {
 
 test() {
     echo "mlir test"
-    node ./tools/mlir_script.js test $@
+    node ./tools/mlir_script.js test "$@"
 }
 
 while [ "$#" != 0 ]; do
@@ -68,6 +68,6 @@ while [ "$#" != 0 ]; do
         "clean") clean;;
         "sync") sync;;
         "schema") schema;;
-        "test") test $@;;
+        "test") test "$@";;
     esac
 done
diff --git a/tools/mlir_script.js b/tools/mlir_script.js