
Commit 9a1cda4

Merge branch 'main' into export-D88661769
2 parents 1bb583d + 1fe59c8 commit 9a1cda4


15 files changed: +266, -250 lines


backends/cadence/aot/TARGETS

Lines changed: 3 additions & 0 deletions
@@ -641,6 +641,9 @@ python_unittest(
     typing = True,
     deps = [
         "//caffe2:torch",
+        "//executorch/backends/cadence/aot:graph_builder",
         "//executorch/backends/cadence/aot/quantizer:quantizer",
+        "//executorch/exir:pass_base",
+        "//pytorch/ao:torchao",
     ],
 )

backends/cadence/aot/tests/test_quantizer_ops.py

Lines changed: 50 additions & 0 deletions
@@ -9,14 +9,64 @@
 import unittest
 
 import torch
+from executorch.backends.cadence.aot.graph_builder import GraphBuilder
 from executorch.backends.cadence.aot.quantizer.patterns import AddmmPattern
 
 from executorch.backends.cadence.aot.quantizer.quantizer import (
     CadenceAtenQuantizer,
     CadenceDefaultQuantizer,
     CadenceW8A32MixedQuantizer,
+    CadenceWith16BitMatmulActivationsQuantizer,
+    qconfig_A16,
     qconfig_A8W8,
 )
+from executorch.exir.pass_base import NodeMetadata
+from torchao.quantization.pt2e.quantizer.quantizer import (
+    Q_ANNOTATION_KEY,
+    QuantizationAnnotation,
+)
+
+
+class QuantizerAnnotationTest(unittest.TestCase):
+    """Unit tests for verifying quantizer annotations are correctly applied."""
+
+    def _build_matmul_graph(self) -> tuple[torch.fx.GraphModule, torch.fx.Node]:
+        """Build a simple graph with a matmul operation."""
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(4, 8))
+        y = builder.placeholder("y", torch.randn(8, 4))
+        matmul = builder.call_operator(
+            op=torch.ops.aten.matmul.default,
+            args=(x, y),
+            meta=NodeMetadata(
+                {"source_fn_stack": [("matmul", torch.ops.aten.matmul.default)]}
+            ),
+        )
+        builder.output([matmul])
+        gm = builder.get_graph_module()
+
+        matmul_nodes = gm.graph.find_nodes(
+            op="call_function",
+            target=torch.ops.aten.matmul.default,
+        )
+        self.assertEqual(len(matmul_nodes), 1, "Should find exactly one matmul node")
+        return gm, matmul_nodes[0]
+
+    def test_matmul_16bit_quantizer_annotation(self) -> None:
+        """Test that CadenceWith16BitMatmulActivationsQuantizer correctly annotates matmul."""
+        gm, matmul_node = self._build_matmul_graph()
+
+        quantizer = CadenceWith16BitMatmulActivationsQuantizer()
+        quantizer.annotate(gm)
+
+        annotation: QuantizationAnnotation = matmul_node.meta[Q_ANNOTATION_KEY]
+        self.assertTrue(annotation._annotated)
+
+        self.assertEqual(annotation.output_qspec, qconfig_A16.output_activation)
+
+        self.assertEqual(len(annotation.input_qspec_map), 2)
+        for _, input_qspec in annotation.input_qspec_map.items():
+            self.assertEqual(input_qspec, qconfig_A16.input_activation)
 
 
 class QuantizerOpsPreserveTest(unittest.TestCase):
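A quick way to exercise only the new annotation tests (not part of this commit; it assumes the repository root is importable as the `executorch` package):

import unittest

# Load just the QuantizerAnnotationTest case added above and run it verbosely.
suite = unittest.defaultTestLoader.loadTestsFromName(
    "executorch.backends.cadence.aot.tests.test_quantizer_ops.QuantizerAnnotationTest"
)
unittest.TextTestRunner(verbosity=2).run(suite)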

backends/cortex_m/passes/convert_to_cortex_m_pass.py

Lines changed: 18 additions & 2 deletions
@@ -171,6 +171,22 @@ def _get_convolution_replacement(self, node) -> int:
             weight_permuted,
         )
 
+        quantized_multiplier_tensor = create_constant_placeholder(
+            self.exported_program,
+            node.graph,
+            node.name + "_quantized_multiplier",
+            InputKind.PARAMETER,
+            torch.tensor(quantized_multipliers, dtype=torch.int32),
+        )
+
+        quantized_shift_tensor = create_constant_placeholder(
+            self.exported_program,
+            node.graph,
+            node.name + "_quantized_shift",
+            InputKind.PARAMETER,
+            torch.tensor(quantized_shifts, dtype=torch.int32),
+        )
+
         new_args = (
             x,
             weight_nhwc,
@@ -180,8 +196,8 @@ def _get_convolution_replacement(self, node) -> int:
             dilation,
             -input_zero_point,
             output_zero_point,
-            torch.tensor(quantized_multipliers, dtype=torch.int32),
-            torch.tensor(quantized_shifts, dtype=torch.int32),
+            quantized_multiplier_tensor,
+            quantized_shift_tensor,
             output_qmin,
             output_qmax,
         )
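For context (not part of the diff): the requantization multiplier and shift tensors are now registered as constant placeholders, so after the pass they should surface as lifted parameters of the exported program rather than inline tensors in the op's args. A rough check, assuming the "_quantized_multiplier" / "_quantized_shift" naming above and that create_constant_placeholder records the values in the program's state dict:

from torch.export import ExportedProgram


def requant_constant_names(exported_program: ExportedProgram) -> list[str]:
    # Collect the names of the requantization constants created by the pass.
    return [
        name
        for name in exported_program.state_dict
        if name.endswith("_quantized_multiplier") or name.endswith("_quantized_shift")
    ]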

backends/qualcomm/README.md

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ Please check `generate_qnn_executorch_compiler_spec()` in
 - SXR2330P
 - QCS9100
 - SAR2230P
+- SW6100
 
 ### Adding more supported Chipset
 Currently, users cannot add additional chipset models because the chipset ID is not accessible to community users. If you have specific chipset models you wish to add, please contact one of the authors in the `Code Reviews` section at the bottom of this page.

backends/qualcomm/serialization/qc_compiler_spec.fbs

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ enum QcomChipset: int {
   QCS9100 = 77,
   SAR2230P = 95,
   SA8255 = 52,
+  SW6100 = 96,
 }
 
 /// Indicate the information of the specified SoC.

backends/qualcomm/serialization/qc_schema.py

Lines changed: 2 additions & 0 deletions
@@ -55,6 +55,7 @@ class QcomChipset(IntEnum):
     QCS9100 = 77  # v73
     SAR2230P = 95  # v81
     SA8255 = 52  # v73
+    SW6100 = 96  # v81
 
 
 @dataclass
@@ -80,6 +81,7 @@ class SocInfo:
     QcomChipset.SXR2330P: SocInfo(QcomChipset.SXR2330P, HtpInfo(HtpArch.V79, 8)),
     QcomChipset.QCS9100: SocInfo(QcomChipset.QCS9100, HtpInfo(HtpArch.V73, 8)),
     QcomChipset.SAR2230P: SocInfo(QcomChipset.SAR2230P, HtpInfo(HtpArch.V81, 4)),
+    QcomChipset.SW6100: SocInfo(QcomChipset.SW6100, HtpInfo(HtpArch.V81, 4)),
 }
 
 

backends/qualcomm/utils/utils.py

Lines changed: 2 additions & 0 deletions
@@ -1106,6 +1106,7 @@ def get_soc_to_arch_map():
         "SXR2330P": HtpArch.V79,
         "QCS9100": HtpArch.V73,
         "SAR2230P": HtpArch.V81,
+        "SW6100": HtpArch.V81,
     }
 
 
@@ -1127,6 +1128,7 @@ def get_soc_to_chipset_map():
         "SXR2330P": QcomChipset.SXR2330P,
         "QCS9100": QcomChipset.QCS9100,
        "SAR2230P": QcomChipset.SAR2230P,
+        "SW6100": QcomChipset.SW6100,
     }
 
 
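A short usage sketch for the new entry (not part of the diff; it assumes generate_htp_compiler_spec() and generate_qnn_executorch_compiler_spec() keep their current signatures in backends/qualcomm/utils/utils.py):

from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
    get_soc_to_arch_map,
)


def sw6100_compile_spec():
    # The new map entry resolves SW6100 to HTP arch v81.
    assert get_soc_to_arch_map()["SW6100"] is not None
    backend_options = generate_htp_compiler_spec(use_fp16=True)
    return generate_qnn_executorch_compiler_spec(
        soc_model=QcomChipset.SW6100,
        backend_options=backend_options,
    )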

examples/models/llama/llama_transformer.py

Lines changed: 7 additions & 0 deletions
@@ -272,6 +272,13 @@ def construct_transformer(model_args: ModelArgs) -> Transformer:
                     norm_eps=model_args.norm_eps,
                 )
             )
+        elif (
+            model_args.layer_types
+            and model_args.layer_types[layer_id] == "skip_attention"
+        ):
+            attention = AttentionSkip()
+            transformer_block = TransformerBlock(model_args, attention)
+            layers.append(transformer_block)
         else:
             attention = cls(
                 model_args, layer_id, rope, **model_args.attention_kwargs
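For illustration only (not in the diff, and the module path and field values below are assumptions inferred from the new branch): a per-layer layer_types entry of "skip_attention" now routes that block through AttentionSkip instead of the regular attention class.

# Illustration only -- field values and the ModelArgs module path are assumptions,
# not part of this commit. construct_transformer() walks layer_types per layer_id;
# any entry equal to "skip_attention" produces a TransformerBlock built around AttentionSkip.
from executorch.examples.models.llama.llama_transformer import construct_transformer
from executorch.examples.models.llama.model_args import ModelArgs  # assumed module path

args = ModelArgs(
    dim=64,
    n_heads=4,
    n_layers=2,
    vocab_size=128,
    layer_types=["attention", "skip_attention"],  # second layer skips attention
)
model = construct_transformer(args)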

examples/portable/executor_runner/executor_runner.cpp

Lines changed: 2 additions & 0 deletions
@@ -19,7 +19,9 @@
  */
 
 #include <cstdint>
+#ifdef ET_BUNDLE_IO_ENABLED
 #include <filesystem>
+#endif // ET_BUNDLE_IO_ENABLED
 #include <fstream>
 #include <iostream>
 #include <memory>

exir/passes/spec_prop_pass.py

Lines changed: 46 additions & 90 deletions
@@ -6,14 +6,16 @@
 
 # pyre-strict
 
-from typing import List, Optional
+import operator
+from typing import Optional
 
 import torch
 from executorch.exir.delegate import executorch_call_delegate
-from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
+from executorch.exir.pass_base import ExportPass, ProxyValue
 from executorch.exir.tensor import TensorSpec
 from torch.export.exported_program import ExportGraphSignature
 from torch.fx.node import Node
+from torch.fx.passes.infra.pass_base import PassResult
 from torch.utils import _pytree as pytree
 
 
@@ -52,12 +54,48 @@ class SpecPropPass(ExportPass):
     def __init__(self) -> None:
         super().__init__()
 
-    def on_attr(self, attr: ProxyValue) -> None:
-        attr.node.meta["spec"] = pytree.tree_map_only(
-            torch.Tensor,
-            make_spec,
-            attr.data,
-        )
+    def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        # Re-trace metadata to ensure it's up to date.
+        res = ExportPass()(graph_module)
+        assert res is not None
+        gm = res.graph_module
+
+        def get_spec(x):
+            if hasattr(x, "meta"):
+                return x.meta.get("spec", None)
+            else:
+                return None
+
+        for module in gm.modules():
+            if isinstance(module, torch.fx.GraphModule):
+                for node in module.graph.nodes:
+                    meta_val = node.meta.get("val", None)
+
+                    if node.op == "output":
+                        node.meta["spec"] = pytree.tree_map(get_spec, node.args[0])
+                    elif node.op == "call_function" and node.target == operator.getitem:
+                        value_spec = pytree.tree_map(get_spec, node.args[0])
+                        node.meta["spec"] = value_spec[node.args[1]]
+                    elif (
+                        node.op == "call_function"
+                        and node.target == executorch_call_delegate
+                    ):
+                        # Note: We currently rely on delegate node specs not being regenerated,
+                        # as the spec is set somewhat manually when adding the call delegate node.
+                        # If we regenerate, it can change and break lowering (it becomes a tuple?).
+                        # Ideally, we should figure out how to make the spec regeneration not break
+                        # things.
+                        #
+                        # We do need to regenerate non-call-delegate node specs, as this pass is called
+                        # multiple times in some lowering paths (backends can and do call it).
+                        if "spec" not in node.meta:
+                            node.meta["spec"] = pytree.tree_map(make_spec, meta_val)
+                    else:
+                        node.meta["spec"] = pytree.tree_map(make_spec, meta_val)
+        return res
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        return self(graph_module)
 
     def update_placeholder_tensor_specs(
         self,
@@ -84,85 +122,3 @@ def update_placeholder_tensor_specs(
                 in exported_program.graph_signature.inputs_to_lifted_tensor_constants
             ):
                 spec.const = True
-
-    # pyre-ignore
-    def placeholder(self, name: str, arg, meta):
-        meta["spec"] = make_spec(arg)
-        return super().placeholder(name, arg, meta)
-
-    # pyre-ignore
-    def call_operator(self, op, args, kwargs, meta):
-        args_data, kwargs_data = pytree.tree_map_only(
-            ProxyValue, lambda x: x.data, (args, kwargs)
-        )
-        meta["spec"] = pytree.tree_map(make_spec, op(*args_data, **kwargs_data))
-        return super().call_operator(op, args, kwargs, meta)
-
-    # pyre-ignore
-    def call_getitem(self, value, key: int, meta):
-        meta["spec"] = value.node.meta["spec"][key]
-        return super().call_getitem(value, key, meta)
-
-    # pyre-ignore
-    def call_cond(self, pred, true_fn, false_fn, inputs, meta):
-        # true_fn/false_fn return tensors of the same shape, so we can pick
-        # either one here.
-        *_, true_out_node = true_fn.graph.nodes
-        meta["spec"] = pytree.tree_map(make_spec, true_out_node.meta["val"])
-        return super().call_cond(pred, true_fn, false_fn, inputs, meta)
-
-    def call_while(
-        self,
-        cond_fn: torch.fx.GraphModule,
-        body_fn: torch.fx.GraphModule,
-        carried_inputs: List[ProxyValue],
-        additional_inputs: List[ProxyValue],
-        meta: NodeMetadata,
-    ):
-        meta["spec"] = pytree.tree_map(make_spec, carried_inputs)
-        return super().call_while(
-            cond_fn, body_fn, carried_inputs, additional_inputs, meta
-        )
-
-    def call_map(
-        self,
-        f: torch.fx.GraphModule,
-        mapped_args: List[ProxyValue],
-        operands: List[ProxyValue],
-        meta: NodeMetadata,
-    ) -> ProxyValue:
-        mapped_dim_size = [arg.data for arg in mapped_args][0].size(0)
-        *_, body_out_node = f.graph.nodes
-        body_out_node_fake_tensor = body_out_node.meta["val"]
-        map_fake_tensor = pytree.tree_map_only(
-            torch.Tensor,
-            lambda x: x.new_empty(mapped_dim_size, *x.shape),
-            body_out_node_fake_tensor,
-        )
-        meta["spec"] = pytree.tree_map(make_spec, map_fake_tensor)
-        return super().call_map(f, mapped_args, operands, meta)
-
-    # pyre-ignore
-    def call_delegate(self, lowered_module, args, kwargs, meta):
-        args_data, kwargs_data = pytree.tree_map_only(
-            ProxyValue, lambda x: x.data, (args, kwargs)
-        )
-        # If spec is missing, re-genenrate it with args data
-        if "spec" not in meta:
-            meta["spec"] = pytree.tree_map(
-                make_spec,
-                executorch_call_delegate(lowered_module, *args_data),
-            )
-        return super().call_delegate(lowered_module, args, kwargs, meta)
-
-    # pyre-ignore
-    def output(self, results, meta):
-        # pyre-ignore
-        def get_spec(x):
-            if isinstance(x, ProxyValue):
-                return x.node.meta["spec"]
-            else:
-                return make_spec(x)
-
-        meta["spec"] = pytree.tree_map(get_spec, results)
-        return super().output(results, meta)
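A small usage sketch (not from the commit) of how the reworked pass is now driven: it is invoked like an ordinary graph pass, returns a PassResult, and leaves a TensorSpec under each node's meta["spec"]:

import torch

from executorch.exir.passes.spec_prop_pass import SpecPropPass


def propagate_specs(gm: torch.fx.GraphModule) -> dict:
    # Run the pass; it re-traces metadata via ExportPass and then fills node.meta["spec"].
    # Assumes gm comes from an export-style trace so nodes carry "val" metadata.
    res = SpecPropPass()(gm)
    assert res is not None
    return {node.name: node.meta.get("spec") for node in res.graph_module.graph.nodes}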

0 commit comments