Commit bae2839

Reduce hot-loop syncs in rollout and action sampling

Parent: 3f1583e

5 files changed: +22 −8 lines

agent/src/metta/agent/util/distribution_utils.py

Lines changed: 4 additions & 6 deletions
@@ -31,9 +31,8 @@ def sample_actions(action_logits: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tenso
     # Sample actions from categorical distribution (replacement=True is implicit when num_samples=1)
     actions = torch.multinomial(action_probs, num_samples=1).view(-1)  # [batch_size]
 
-    # Extract log-probabilities for sampled actions using advanced indexing
-    batch_indices = torch.arange(actions.shape[0], device=actions.device)
-    act_log_prob = full_log_probs[batch_indices, actions]  # [batch_size]
+    # Extract log-probabilities for sampled actions without arange churn
+    act_log_prob = full_log_probs.gather(1, actions.view(-1, 1)).squeeze(1)  # [batch_size]
 
     # Compute policy entropy: H(π) = -∑π(a|s)log π(a|s)
     entropy = -torch.sum(action_probs * full_log_probs, dim=-1)  # [batch_size]

@@ -65,9 +64,8 @@ def evaluate_actions(action_logits: Tensor, actions: Tensor) -> Tuple[Tensor, Te
     action_log_probs = F.log_softmax(action_logits, dim=-1)  # [batch_size, num_actions]
     action_probs = torch.exp(action_log_probs)  # [batch_size, num_actions]
 
-    # Extract log-probabilities for the provided actions using advanced indexing
-    batch_indices = torch.arange(actions.shape[0], device=actions.device)
-    log_probs = action_log_probs[batch_indices, actions]  # [batch_size]
+    # Extract log-probabilities for the provided actions without arange churn
+    log_probs = action_log_probs.gather(1, actions.view(-1, 1)).squeeze(1)  # [batch_size]
 
     # Compute policy entropy: H(π) = -∑π(a|s)log π(a|s)
     entropy = -torch.sum(action_probs * action_log_probs, dim=-1)  # [batch_size]
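
For context, a quick standalone check (not part of the commit) that the gather form returns the same values as the arange-based advanced indexing it replaces, just without allocating a fresh index tensor on the device every call:

import torch

logits = torch.randn(4, 6)                     # [batch_size, num_actions]
log_probs = torch.log_softmax(logits, dim=-1)
actions = torch.randint(0, 6, (4,))            # one action index per row

# Old approach: build an arange helper tensor, then advanced-index.
batch_indices = torch.arange(actions.shape[0], device=actions.device)
via_indexing = log_probs[batch_indices, actions]

# New approach: gather along the action dimension, no helper tensor.
via_gather = log_probs.gather(1, actions.view(-1, 1)).squeeze(1)

assert torch.allclose(via_indexing, via_gather)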

metta/rl/loss/loss.py

Lines changed: 10 additions & 1 deletion
@@ -145,6 +145,7 @@ class Loss:
     loss_tracker: dict[str, list[float]] | None = None
     _zero_tensor: Tensor | None = None
     _context: ComponentContext | None = None
+    _metric_mb_idx: int | None = field(default=None, init=False, repr=False)
 
     _state_attrs: set[str] = field(default_factory=set, init=False, repr=False)
 

@@ -205,7 +206,11 @@ def train(
         ctx = self._ensure_context(context)
         if not self._loss_gate_allows("train", ctx):
             return self._zero(), shared_loss_data, False
-        return self.run_train(shared_loss_data, ctx, mb_idx)
+        self._metric_mb_idx = mb_idx
+        try:
+            return self.run_train(shared_loss_data, ctx, mb_idx)
+        finally:
+            self._metric_mb_idx = None
 
     def run_train(
         self,

@@ -248,6 +253,10 @@ def stats(self) -> dict[str, float]:
 
     def track_metric(self, key: str, value: Tensor | float) -> None:
         """Track a scalar metric."""
+        interval = getattr(self.trainer_cfg, "loss_metric_interval", 1)
+        if interval > 1 and self._metric_mb_idx is not None:
+            if self._metric_mb_idx % interval != 0:
+                return
         _track_metric(self.loss_tracker, key, value)
 
     def metric_mean(self, key: str) -> float:
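
A minimal sketch (hypothetical should_record helper, not repo code) of the gating that track_metric now applies when loss_metric_interval is above 1:

def should_record(mb_idx: int | None, interval: int) -> bool:
    """Record every minibatch at interval 1, otherwise only every interval-th one."""
    if interval > 1 and mb_idx is not None:
        return mb_idx % interval == 0
    return True

assert should_record(0, 8)       # minibatch 0 is always recorded
assert not should_record(3, 8)   # off-interval minibatches are skipped
assert should_record(5, 1)       # interval of 1 preserves the previous behaviour
assert should_record(None, 8)    # calls outside train() (no mb_idx set) still record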

metta/rl/trainer_config.py

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ class TrainerConfig(Config):
 
     # Debug/perf toggles.
     synchronize_after_optimizer_step: bool = False
+    loss_metric_interval: int = Field(default=1, ge=1)
     update_epochs: int = Field(default=1, gt=0)
     scale_batches_by_world_size: bool = False
 
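
As a small illustration (standalone model, not the actual TrainerConfig), the ge=1 bound means Pydantic rejects zero or negative intervals at construction time:

from pydantic import BaseModel, Field, ValidationError

class _Cfg(BaseModel):
    loss_metric_interval: int = Field(default=1, ge=1)

assert _Cfg().loss_metric_interval == 1                        # default keeps per-minibatch metrics
assert _Cfg(loss_metric_interval=8).loss_metric_interval == 8
try:
    _Cfg(loss_metric_interval=0)
except ValidationError:
    print("values below 1 are rejected")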

metta/rl/training/experience.py

Lines changed: 6 additions & 1 deletion
@@ -51,6 +51,8 @@ def __init__(
 
         # Row-aligned tracking (per-agent row slot id and position within row)
         self.t_in_row = torch.zeros(total_agents, device=self.device, dtype=torch.int32)
+        # Keep a CPU mirror to avoid GPU syncs for scalar reads.
+        self._t_in_row_cpu = torch.zeros(total_agents, device="cpu", dtype=torch.int32)
         self.row_slot_ids = torch.arange(total_agents, device=self.device, dtype=torch.int32) % self.segments
         self.free_idx = total_agents % self.segments
 

@@ -114,7 +116,7 @@ def store(self, data_td: TensorDict, env_id: slice) -> None:
         assert isinstance(env_id, slice), (
             f"TypeError: env_id expected to be a slice for segmented storage. Got {type(env_id).__name__} instead."
         )
-        t_in_row_val = self.t_in_row[env_id.start].item()
+        t_in_row_val = int(self._t_in_row_cpu[env_id.start].item())
         row_ids = self.row_slot_ids[env_id]
 
         # Scheduler updates these keys based on the active losses for the epoch.

@@ -124,6 +126,7 @@ def store(self, data_td: TensorDict, env_id: slice) -> None:
             raise ValueError("No store keys set. set_store_keys() was likely used incorrectly.")
 
         self.t_in_row[env_id] += 1
+        self._t_in_row_cpu[env_id] += 1
 
         if t_in_row_val + 1 >= self.bptt_horizon:
             self._reset_completed_episodes(env_id)

@@ -133,6 +136,7 @@ def _reset_completed_episodes(self, env_id) -> None:
         num_full = env_id.stop - env_id.start
         self.row_slot_ids[env_id] = (self.free_idx + self._range_tensor[:num_full]) % self.segments
         self.t_in_row[env_id] = 0
+        self._t_in_row_cpu[env_id] = 0
         self.free_idx = (self.free_idx + num_full) % self.segments
         self.full_rows += num_full
 

@@ -142,6 +146,7 @@ def reset_for_rollout(self) -> None:
         self.free_idx = self.total_agents % self.segments
         self.row_slot_ids = self._range_tensor % self.segments
         self.t_in_row.zero_()
+        self._t_in_row_cpu.zero_()
 
     def update(self, indices: Tensor, data_td: TensorDict) -> None:
         """Update buffer with new data for given indices."""

recipes/experiment/machina_1.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ def train(
         teacher=teacher,
     )
     tt.policy_architecture = policy_architecture or ViTDefaultConfig()
+    tt.trainer.loss_metric_interval = 8
 
     # Explicitly keep full vibe/action definitions so saved checkpoints remain compatible.
    env_cfg = tt.training_env.curriculum.task_generator.env

0 commit comments
