
Commit aec5381

committed
cp
1 parent d4caa66 commit aec5381

File tree: 11 files changed, +1287 −9 lines changed

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
"""Diversity injection for RL policy networks.

When reward gradients vanish (stuck in local minima or flat regions), this module
automatically expands exploration of nearby representational variants by injecting
agent-specific random perturbations into the encoder output.

Key insight: when PPO loss → 0 (stuck), the diversity loss term automatically
dominates, pushing α higher and increasing representational spread across agents.
"""

from __future__ import annotations

import torch
import torch.nn as nn
from tensordict import TensorDict

from metta.agent.components.component_config import ComponentConfig


class DiversityInjectionConfig(ComponentConfig):
    """Configuration for diversity injection layer."""

    in_key: str
    out_key: str
    name: str = "diversity_injection"

    # Number of agent slots to support (should match max agents in training)
    num_agents: int = 256

    # Low-rank approximation rank for memory efficiency
    # W = U @ V.T where U, V are (hidden_dim, rank)
    projection_rank: int = 32

    # Initial value for log_alpha (α = exp(log_alpha))
    # -1.0 means α starts at ~0.37
    log_alpha_init: float = -1.0

    # Maximum value for α to prevent explosion
    alpha_max: float = 5.0

    # Whether to apply LayerNorm after injection for stability
    use_layer_norm: bool = True

    # Key in TensorDict containing agent IDs (training_env_ids by default)
    agent_id_key: str = "training_env_ids"

    def make_component(self, env=None) -> nn.Module:
        return DiversityInjection(config=self)


class DiversityInjection(nn.Module):
    """Applies agent-specific random perturbations to encoder output.

    Architecture:
        obs → [shared encoder] → h → h + α * perturbation → [policy_head] → logits
                                                           → [value_head] → value

    Where perturbation = W_rand[agent_id] @ h using low-rank factorization.
    """

    def __init__(self, config: DiversityInjectionConfig):
        super().__init__()
        self.config = config
        self.in_key = config.in_key
        self.out_key = config.out_key
        self.agent_id_key = config.agent_id_key
        self.alpha_max = config.alpha_max

        # Learned scalar controlling perturbation strength
        self.log_alpha = nn.Parameter(torch.tensor(config.log_alpha_init))

        # Lazy initialization - we don't know hidden_dim until first forward
        self._hidden_dim: int | None = None

        # Register placeholder buffers (will be replaced on first forward)
        self.register_buffer("_projection_u", None)
        self.register_buffer("_projection_v", None)

        self.layer_norm: nn.LayerNorm | None = None

    def _initialize_projections(self, hidden_dim: int, device: torch.device, dtype: torch.dtype) -> None:
        """Initialize random projection matrices on first forward pass."""
        if self._hidden_dim == hidden_dim and self._projection_u is not None:
            # Already initialized, just ensure device matches
            if self._projection_u.device != device:
                self._projection_u = self._projection_u.to(device)
                self._projection_v = self._projection_v.to(device)
                if self.layer_norm is not None:
                    self.layer_norm = self.layer_norm.to(device)
            return

        self._hidden_dim = hidden_dim
        rank = self.config.projection_rank
        num_agents = self.config.num_agents

        # Create low-rank factorization: W = U @ V.T
        # Scale by 1/sqrt(rank) for stable initialization
        scale = 1.0 / (rank**0.5)

        # Generate deterministic random projections per agent using seeded generators
        projection_u = torch.zeros(num_agents, hidden_dim, rank, dtype=dtype, device=device)
        projection_v = torch.zeros(num_agents, rank, hidden_dim, dtype=dtype, device=device)

        for agent_idx in range(num_agents):
            gen = torch.Generator()
            gen.manual_seed(agent_idx * 31337)  # Deterministic per-agent seed
            projection_u[agent_idx] = (
                torch.randn(hidden_dim, rank, generator=gen, dtype=dtype, device="cpu").to(device) * scale
            )
            projection_v[agent_idx] = (
                torch.randn(rank, hidden_dim, generator=gen, dtype=dtype, device="cpu").to(device) * scale
            )

        # Update buffers in-place
        self._projection_u = projection_u
        self._projection_v = projection_v

        # Initialize LayerNorm if enabled
        if self.config.use_layer_norm and self.layer_norm is None:
            self.layer_norm = nn.LayerNorm(hidden_dim).to(device)

    @property
    def alpha(self) -> torch.Tensor:
        """Current perturbation strength coefficient."""
        return self.log_alpha.exp().clamp(max=self.alpha_max)

    def forward(self, td: TensorDict) -> TensorDict:
        h = td[self.in_key]  # (batch, hidden_dim) or (batch, time, hidden_dim)

        # Initialize on first forward
        self._initialize_projections(h.shape[-1], h.device, h.dtype)

        # Get agent IDs - handle both (batch,) and (batch, time) shapes
        if self.agent_id_key in td.keys():
            agent_ids = td[self.agent_id_key]
            # Flatten to 1D if needed, take first element per batch item if (batch, time)
            if agent_ids.dim() > 1:
                agent_ids = agent_ids[:, 0] if agent_ids.shape[1] > 0 else agent_ids.squeeze(-1)
            agent_ids = agent_ids.long() % self.config.num_agents
        else:
            # Default to agent 0 if no agent IDs provided (e.g., during eval)
            agent_ids = torch.zeros(h.shape[0], dtype=torch.long, device=h.device)

        # Compute perturbation using low-rank factorization
        # h @ U @ V.T = (h @ U) @ V.T
        original_shape = h.shape
        if h.dim() == 3:
            # (batch, time, hidden) -> (batch * time, hidden)
            batch, time, hidden = h.shape
            h_flat = h.reshape(batch * time, hidden)
            # Expand agent_ids to match flattened batch
            agent_ids = agent_ids.unsqueeze(1).expand(batch, time).reshape(batch * time)
        else:
            h_flat = h
            batch, time = h.shape[0], 1

        # Gather projection matrices for each sample's agent
        # _projection_u: (num_agents, hidden_dim, rank)
        # _projection_v: (num_agents, rank, hidden_dim)
        u = self._projection_u[agent_ids]  # (batch, hidden_dim, rank)
        v = self._projection_v[agent_ids]  # (batch, rank, hidden_dim)

        # Compute perturbation: h @ U @ V.T
        # (batch, hidden) @ (batch, hidden, rank) -> (batch, rank)
        intermediate = torch.einsum("bh,bhr->br", h_flat, u)
        # (batch, rank) @ (batch, rank, hidden) -> (batch, hidden)
        perturbation = torch.einsum("br,brh->bh", intermediate, v)

        # Apply perturbation with learned coefficient
        alpha = self.alpha
        h_div = h_flat + alpha * perturbation

        # Apply LayerNorm for stability when α is large
        if self.layer_norm is not None:
            h_div = self.layer_norm(h_div)

        # Reshape back if needed
        if len(original_shape) == 3:
            h_div = h_div.reshape(original_shape)

        td[self.out_key] = h_div

        return td

    def get_diversity_loss(self) -> torch.Tensor:
        """Return diversity loss term: -log_alpha.

        This encourages α to grow when other losses are small.
        """
        return -self.log_alpha

    def extra_repr(self) -> str:
        return (
            f"in_key={self.in_key}, out_key={self.out_key}, "
            f"num_agents={self.config.num_agents}, rank={self.config.projection_rank}, "
            f"alpha_max={self.alpha_max}, use_layer_norm={self.config.use_layer_norm}"
        )
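
Note (not part of the commit): the component is a TensorDict-in, TensorDict-out layer, so it can be smoke-tested in isolation. A minimal sketch, assuming DiversityInjectionConfig accepts keyword construction like a pydantic model and that torch and tensordict are installed; shapes and key names are illustrative.

    import torch
    from tensordict import TensorDict

    # Small sizes for a quick check; in_key/out_key mirror how the ViT config wires it.
    cfg = DiversityInjectionConfig(in_key="core", out_key="core", num_agents=8, projection_rank=4)
    layer = cfg.make_component()

    td = TensorDict(
        {
            "core": torch.randn(6, 128),          # (batch, hidden_dim)
            "training_env_ids": torch.arange(6),  # one agent id per sample
        },
        batch_size=[6],
    )
    out = layer(td)
    assert out["core"].shape == (6, 128)          # shape is preserved; values are perturbed per agent
    print(layer.alpha)                            # starts near exp(-1) ≈ 0.37, clamped at alpha_max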

agent/src/metta/agent/policies/vit.py

Lines changed: 20 additions & 0 deletions
@@ -10,6 +10,7 @@
 from metta.agent.components.actor import ActionProbsConfig, ActorHeadConfig
 from metta.agent.components.component_config import ComponentConfig
 from metta.agent.components.cortex import CortexTDConfig
+from metta.agent.components.diversity_injection import DiversityInjectionConfig
 from metta.agent.components.misc import MLPConfig
 from metta.agent.components.obs_enc import ObsPerceiverLatentConfig
 from metta.agent.components.obs_shim import ObsShimTokensConfig
@@ -168,6 +169,10 @@ class ViTDefaultConfig(PolicyArchitecture):
     # Whether to torch.compile the trunk (Cortex stack)
     core_compile: bool = False

+    # Diversity injection - auto-expands exploration when gradients vanish
+    # Enable with losses.diversity.enabled=True losses.diversity.diversity_coef=0.01
+    use_diversity_injection: bool = False
+
     components: List[ComponentConfig] = [
         ObsShimTokensConfig(in_key="env_obs", out_key="obs_shim_tokens", max_tokens=48),
         ObsAttrEmbedFourierConfig(
@@ -233,6 +238,21 @@ def make_policy(self, policy_env_info: PolicyEnvInterface) -> Policy:
             compile_blocks=self.core_compile,
         )

+        # Conditionally add diversity injection after Cortex
+        if self.use_diversity_injection:
+            # Find Cortex index and insert diversity injection after it
+            cortex_idx = next(i for i, c in enumerate(self.components) if isinstance(c, CortexTDConfig))
+            # Check if already inserted
+            if not any(isinstance(c, DiversityInjectionConfig) for c in self.components):
+                self.components.insert(
+                    cortex_idx + 1,
+                    DiversityInjectionConfig(
+                        in_key="core",
+                        out_key="core",  # in-place replacement
+                        name="diversity_injection",
+                    ),
+                )
+
         AgentClass = load_symbol(self.class_path)
         if not isinstance(AgentClass, type):
             raise TypeError(f"Loaded symbol {self.class_path} is not a class")
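
Note (not part of the commit): a hypothetical sketch of opting in from code. The use_diversity_injection field is the one added above; the construction style and the policy_env_info argument are assumed to follow the repo's usual pattern. Per the comment in the diff, training runs are expected to also enable losses.diversity.enabled=True with losses.diversity.diversity_coef=0.01.

    # policy_env_info is assumed to be an existing PolicyEnvInterface instance.
    arch = ViTDefaultConfig(use_diversity_injection=True)
    policy = arch.make_policy(policy_env_info)  # DiversityInjection is inserted right after the Cortex block, rewriting "core"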

metta/rl/loss/cmpo.py

Lines changed: 18 additions & 8 deletions
@@ -109,14 +109,24 @@ def update_target_network(self) -> None:
             return

         with torch.no_grad():
-            for target_param, online_param in zip(
-                self.target_policy.parameters(),
-                self.policy.parameters(),
-                strict=False,
-            ):
-                target_param.data = (
-                    self.cfg.target_ema_decay * target_param.data + (1 - self.cfg.target_ema_decay) * online_param.data
-                )
+            target_state = self.target_policy.state_dict()
+            online_state = self.policy.state_dict()
+
+            for name, online_param in online_state.items():
+                if name in target_state:
+                    target_param = target_state[name]
+                    if target_param.shape == online_param.shape:
+                        target_state[name] = (
+                            self.cfg.target_ema_decay * target_param + (1 - self.cfg.target_ema_decay) * online_param
+                        )
+                    else:
+                        # Shape mismatch (e.g., lazy init resize) - copy directly
+                        target_state[name] = online_param.clone()
+                else:
+                    # New parameter - add it
+                    target_state[name] = online_param.clone()
+
+            self.target_policy.load_state_dict(target_state)

     def compute_cmpo_policy(
         self,
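
Note (not part of the commit): switching from zipping parameters() to merging state_dict()s means buffers and lazily created entries (such as the diversity projection buffers above) are carried into the target network as well, and shape changes after lazy initialization no longer break the EMA update. A self-contained sketch of the same pattern, outside the repo's classes:

    import torch
    import torch.nn as nn

    def ema_update(target: nn.Module, online: nn.Module, decay: float = 0.995) -> None:
        """Blend online weights into target; copy entries that are new or changed shape."""
        with torch.no_grad():
            target_state = target.state_dict()
            for name, online_param in online.state_dict().items():
                current = target_state.get(name)
                if current is not None and current.shape == online_param.shape:
                    target_state[name] = decay * current + (1 - decay) * online_param
                else:
                    target_state[name] = online_param.clone()
            target.load_state_dict(target_state)

    online, target = nn.Linear(4, 4), nn.Linear(4, 4)
    ema_update(target, online)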

metta/rl/loss/diversity.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
"""Diversity loss for encouraging representational exploration.

This loss works in conjunction with DiversityInjection component to automatically
increase exploration when policy gradients vanish. The key insight is that when
PPO loss → 0 (stuck in local minima), the diversity loss term dominates, pushing
α higher and increasing representational spread across agents.
"""

from typing import Any

import torch
from pydantic import Field
from tensordict import TensorDict
from torch import Tensor
from torchrl.data import Composite

from metta.agent.policy import Policy
from metta.rl.loss.loss import Loss, LossConfig
from metta.rl.training import ComponentContext, TrainingEnvironment


class DiversityLossConfig(LossConfig):
    """Configuration for diversity loss."""

    # Coefficient for diversity loss term (-log_alpha)
    # Start small (~0.01) and tune as needed
    diversity_coef: float = Field(default=0.01, ge=0)

    # Name of the DiversityInjection component in the policy
    # Used to find the log_alpha parameter
    diversity_component_name: str = "diversity_injection"

    def create(
        self,
        policy: Policy,
        trainer_cfg: Any,
        env: TrainingEnvironment,
        device: torch.device,
        instance_name: str,
    ) -> "DiversityLoss":
        return DiversityLoss(policy, trainer_cfg, env, device, instance_name, self)


class DiversityLoss(Loss):
    """Diversity loss that encourages exploration when policy gradients vanish.

    Loss = -diversity_coef * log_alpha

    When α is small (low diversity), log_alpha is negative, so -log_alpha is positive
    and this loss encourages α to grow. When PPO loss is meaningful, its gradients
    dominate and α stays controlled. When stuck (PPO loss ≈ 0), diversity loss
    dominates and α grows, increasing representational spread.
    """

    def __init__(
        self,
        policy: Policy,
        trainer_cfg: Any,
        env: TrainingEnvironment,
        device: torch.device,
        instance_name: str,
        cfg: DiversityLossConfig,
    ):
        super().__init__(policy, trainer_cfg, env, device, instance_name, cfg)
        self._diversity_component = None
        self._find_diversity_component()

    def _find_diversity_component(self) -> None:
        """Find the DiversityInjection component in the policy."""
        if hasattr(self.policy, "components"):
            component_name = self.cfg.diversity_component_name
            if component_name in self.policy.components:
                self._diversity_component = self.policy.components[component_name]
            else:
                # Try to find any DiversityInjection component
                from metta.agent.components.diversity_injection import DiversityInjection

                for _, component in self.policy.components.items():
                    if isinstance(component, DiversityInjection):
                        self._diversity_component = component
                        break

    def get_experience_spec(self) -> Composite:
        """Diversity loss doesn't require additional experience fields."""
        return Composite()

    def run_rollout(self, td: TensorDict, context: ComponentContext) -> None:
        """No-op during rollout - diversity loss only affects training."""
        pass

    def run_train(
        self, shared_loss_data: TensorDict, context: ComponentContext, mb_idx: int
    ) -> tuple[Tensor, TensorDict, bool]:
        """Compute diversity loss from the DiversityInjection component."""
        if self._diversity_component is None:
            # No diversity component found, return zero loss
            zero_loss = torch.tensor(0.0, device=self.device, requires_grad=True)
            return zero_loss, shared_loss_data, False

        # Get diversity loss from component
        diversity_loss = self._diversity_component.get_diversity_loss()
        weighted_loss = self.cfg.diversity_coef * diversity_loss

        # Track metrics
        alpha = self._diversity_component.alpha
        self._track("diversity_loss", weighted_loss)
        self._track("diversity_alpha", alpha)
        self._track("diversity_log_alpha", self._diversity_component.log_alpha)

        return weighted_loss, shared_loss_data, False

    def _track(self, key: str, value: Tensor) -> None:
        """Track a metric value."""
        if value.numel() == 1:
            self.loss_tracker[key].append(float(value.item()))
        else:
            self.loss_tracker[key].append(float(value.mean().item()))
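
Note (not part of the commit): a quick standalone check of the dynamics described in the docstring. With only the diversity term active, log_alpha receives a constant gradient of -diversity_coef, so a plain SGD step raises it by lr * diversity_coef regardless of its current value, and α = exp(log_alpha) grows geometrically until the component clamps it at alpha_max; any other loss with remaining signal pushes back through the same parameter via the perturbed representation.

    import torch

    log_alpha = torch.nn.Parameter(torch.tensor(-1.0))
    opt = torch.optim.SGD([log_alpha], lr=0.1)
    diversity_coef = 0.01

    for step in range(3):
        loss = -diversity_coef * log_alpha   # same form as DiversityLoss.run_train
        opt.zero_grad()
        loss.backward()
        opt.step()                           # log_alpha grows by lr * diversity_coef = 0.001 per step
        print(step, round(log_alpha.item(), 4), round(log_alpha.exp().item(), 4))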

metta/rl/loss/dynamics.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@

 class DynamicsConfig(LossConfig):
     returns_step_look_ahead: int = Field(default=1)
-    unroll_steps: int = Field(default=0)
+    unroll_steps: int = Field(default=2)
     returns_pred_coef: float = Field(default=1.0, ge=0, le=1.0)
     reward_pred_coef: float = Field(default=1.0, ge=0, le=1.0)
