@@ -27,7 +27,7 @@ def forward(self, td: TensorDict, action: torch.Tensor = None) -> TensorDict:
         if "values" in td.keys():
             td["values"] = td["values"].flatten()

-        # Dynamics/Muesli predictions - only if modules were created by Dynamics loss
+        # Dynamics/Muesli predictions - only if modules were created
         if hasattr(self, "returns_pred") and self.returns_pred is not None:
             td["pred_input"] = torch.cat([td["core"], td["logits"]], dim=-1)
             self.returns_pred(td)
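A minimal shape sketch of the prediction input built in this hunk, for reference only: the sizes and the bare `nn.Linear` head are assumptions for illustration, while in the diff the head is a module invoked on the TensorDict.

```python
# Illustrative only: assumed sizes; the real returns_pred operates on a TensorDict.
import torch
import torch.nn as nn

latent_dim, num_actions = 512, 16             # assumed dimensions
core = torch.randn(8, latent_dim)             # batch of core outputs (td["core"])
logits = torch.randn(8, num_actions)          # batch of policy logits (td["logits"])

pred_input = torch.cat([core, logits], dim=-1)      # mirrors td["pred_input"]
returns_pred = nn.Linear(latent_dim + num_actions, 1)
print(returns_pred(pred_input).shape)               # torch.Size([8, 1])
```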
@@ -139,7 +139,12 @@ def _prediction_step(self, hidden: torch.Tensor) -> tuple[torch.Tensor, torch.Te


 class ViTDefaultConfig(PolicyArchitecture):
-    """Speed-optimized ViT variant with lighter token embeddings and attention stack."""
+    """Speed-optimized ViT variant with lighter token embeddings and attention stack.
+
+    The trunk uses Axon blocks (post-up experts with residual connections) for efficient
+    feature processing. Configure trunk depth, layer normalization, and hidden dimension
+    scaling independently.
+    """

     class_path: str = "metta.agent.policy_auto_builder.PolicyAutoBuilder"

@@ -155,6 +160,12 @@ class ViTDefaultConfig(PolicyArchitecture):
     # Dynamics/Muesli unroll steps - set > 0 to enable dynamics modules
     unroll_steps: int = 0

+    # Trunk configuration
+    # Number of Axon layers in the trunk (defaults to 1; e.g. 16 for a large model)
+    trunk_num_resnet_layers: int = 1
+    # Enable layer normalization after each trunk layer
+    trunk_use_layer_norm: bool = True
+
     components: List[ComponentConfig] = [
         ObsShimTokensConfig(in_key="env_obs", out_key="obs_shim_tokens", max_tokens=48),
         ObsAttrEmbedFourierConfig(
@@ -180,9 +191,9 @@ class ViTDefaultConfig(PolicyArchitecture):
             key_prefix="vit_cortex_state",
             stack_cfg=build_cortex_auto_config(
                 d_hidden=_latent_dim,
-                num_layers=1,
-                pattern="L",
-                post_norm=False,
+                num_layers=trunk_num_resnet_layers,
+                pattern="A",  # Axon blocks provide residual-like connections
+                post_norm=trunk_use_layer_norm,
             ),
             pass_state_during_training=pass_state_during_training,
         ),
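The Axon block implementation itself is not part of this diff, so the following is a conceptual sketch only, an assumed residual block that illustrates what "residual-like connections" and the `post_norm` flag mean in the configuration above.

```python
# Conceptual sketch; not the real Axon block. Shows a residual feed-forward block
# with an optional post-layer normalization, matching the post_norm flag.
import torch
import torch.nn as nn

class ResidualBlockSketch(nn.Module):
    def __init__(self, d_hidden: int, post_norm: bool = True):
        super().__init__()
        self.ff = nn.Sequential(
            nn.Linear(d_hidden, 4 * d_hidden),
            nn.GELU(),
            nn.Linear(4 * d_hidden, d_hidden),
        )
        self.norm = nn.LayerNorm(d_hidden) if post_norm else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Residual connection, then optional post-layer normalization.
        return self.norm(x + self.ff(x))
```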
@@ -208,20 +219,8 @@ class ViTDefaultConfig(PolicyArchitecture):
     action_probs_config: ActionProbsConfig = ActionProbsConfig(in_key="logits")

     def make_policy(self, policy_env_info: PolicyEnvInterface) -> Policy:
-        # Ensure downstream components match core dimension
-        # (self._latent_dim might have been overridden on the instance without updating all components)
-        cortex = next(c for c in self.components if isinstance(c, CortexTDConfig))
-        core_dim = cortex.out_features or cortex.d_hidden
-
-        actor_mlp = next(c for c in self.components if c.name == "actor_mlp")
-        assert isinstance(actor_mlp, MLPConfig)
-        if actor_mlp.in_features != core_dim:
-            actor_mlp.in_features = core_dim
-
-        critic = next(c for c in self.components if c.name == "critic")
-        assert isinstance(critic, MLPConfig)
-        if critic.in_features != core_dim:
-            critic.in_features = core_dim
+        # Note: trunk configuration (num_layers, layer_norm, scaling) is applied
+        # via the components list definition above; no runtime modification is needed.

         AgentClass = load_symbol(self.class_path)
         policy = AgentClass(policy_env_info, self)
@@ -230,12 +229,12 @@ def make_policy(self, policy_env_info: PolicyEnvInterface) -> Policy:

         # Only create dynamics modules if unroll_steps > 0
         if self.unroll_steps > 0:
-            latent_dim = core_dim
-            num_actions = policy.num_actions
+            latent_dim = int(self._latent_dim)
+            num_actions = int(policy.num_actions)

             # Dynamics Model: (Hidden + Action) -> (Hidden + Reward)
-            dyn_input_dim = latent_dim + num_actions
-            dyn_output_dim = latent_dim + 1
+            dyn_input_dim = int(latent_dim + num_actions)
+            dyn_output_dim = int(latent_dim + 1)

             dynamics_net = nn.Sequential(
                 nn.Linear(dyn_input_dim, 256),
@@ -245,7 +244,8 @@ def make_policy(self, policy_env_info: PolicyEnvInterface) -> Policy:
             policy.dynamics_model = dynamics_net

             # Returns/Reward Prediction Heads (for Muesli)
-            pred_input_dim = latent_dim + num_actions
+            # Input: Core + Logits
+            pred_input_dim = int(latent_dim + num_actions)

             returns_module = nn.Linear(pred_input_dim, 1)
             reward_module = nn.Linear(pred_input_dim, 1)
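A quick shape-check sketch of the dimensions introduced in these hunks: the hidden layer size of 256 comes from the diff, but the activation and output layer of the Sequential are assumed here, since the hunk truncates the full definition, and the latent/action sizes are placeholders.

```python
# Sketch under assumptions: 512/16 are illustrative sizes; ReLU and the final Linear
# of dynamics_net are assumed, as the diff does not show the rest of the Sequential.
import torch
import torch.nn as nn

latent_dim, num_actions = 512, 16
dyn_input_dim = latent_dim + num_actions          # (Hidden + Action)
dyn_output_dim = latent_dim + 1                   # (Hidden + Reward)

dynamics_net = nn.Sequential(
    nn.Linear(dyn_input_dim, 256),
    nn.ReLU(),
    nn.Linear(256, dyn_output_dim),
)

hidden = torch.randn(4, latent_dim)
action = torch.nn.functional.one_hot(torch.randint(num_actions, (4,)), num_actions).float()
out = dynamics_net(torch.cat([hidden, action], dim=-1))
next_hidden, reward = out[..., :latent_dim], out[..., latent_dim:]

# The Muesli heads consume Core + Logits, i.e. pred_input_dim features each.
pred_input_dim = latent_dim + num_actions
returns_module = nn.Linear(pred_input_dim, 1)
reward_module = nn.Linear(pred_input_dim, 1)
```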