TensorAuto · WilliamYue37 · Jan 26, 2026 · Jan 23, 2026 · Jan 23, 2026 · Jan 23, 2026
diff --git a/.github/scripts/check_accumulate_grad_sync.py b/.github/scripts/check_accumulate_grad_sync.py
@@ -0,0 +1,40 @@
+# Copyright 2026 Tensor Auto Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+from utils import grep_file
+
+from opentau.configs.parser import wrap
+
+
+@dataclass
+class Arg:
+    """Arguments consumed by `main` for the gradient-sync log check.
+
+    Populated by the `wrap()` decorator (presumably from command-line flags;
+    confirm against `opentau.configs.parser`).
+    """
+
+    # Path of the log file to scan.
+    log_path: str
+    # Exact number of lines matching `re_pattern` that the log must contain.
+    expected_length: int
+    # Regex searched in each log line; capture group 1 holds the logged flag text.
+    re_pattern: str = r"accelerator\.sync_gradients=(True|False)"
+    # Period N of the expected schedule: every N-th match must report a sync.
+    gradient_accumulation_steps: int = 2
+
+
+@wrap()
+def main(arg: Arg) -> None:
+    """Verify that logged `sync_gradients` flags follow the accumulation schedule.
+
+    Every line of `arg.log_path` matching `arg.re_pattern` contributes one flag.
+    The i-th flag (0-based) must be true exactly when `i + 1` is a multiple of
+    `arg.gradient_accumulation_steps`.
+
+    Args:
+        arg: Parsed check configuration.
+
+    Raises:
+        AssertionError: If the number of flags differs from
+            `arg.expected_length` or any flag deviates from the schedule.
+    """
+    # bool("False") is True because every non-empty string is truthy, so the
+    # captured text has to be compared against the literal "True" instead of
+    # being passed through `bool`.
+    sync_grads = grep_file(arg.log_path, arg.re_pattern, processor=lambda text: text == "True")
+    assert len(sync_grads) == arg.expected_length, (
+        f"Expected {arg.expected_length} sync_gradients, found {len(sync_grads)} in {arg.log_path}."
+    )
+    assert all(sg == ((i + 1) % arg.gradient_accumulation_steps == 0) for i, sg in enumerate(sync_grads)), (
+        f"Sync gradients should be set according to "
+        f"gradient_accumulation_steps={arg.gradient_accumulation_steps}, "
+        f"got {sync_grads}."
+    )
+
+
+# Without this guard, executing the script only defines `main` and exits
+# successfully without checking anything.
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/check_loss_drop.py b/.github/scripts/check_loss_drop.py
@@ -0,0 +1,99 @@
+# Copyright 2026 Tensor Auto Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+
+import numpy as np
+from utils import grep_file
+
+from opentau.configs.parser import wrap
+
+
+@dataclass
+class Arg:
+    """Arguments consumed by `main` for the loss-drop log check.
+
+    Populated by the `wrap()` decorator (presumably from command-line flags;
+    confirm against `opentau.configs.parser`).
+    """
+
+    # Path of the training log to scan.
+    log_path: str
+    # Exact number of loss values expected in `log_path`.
+    expected_length: int
+    # Regex searched in each log line; capture group 1 is parsed with `float`.
+    re_pattern: str = r"mse_loss:([0-9.eE+-]+)"
+    # Standard deviation of the Gaussian smoothing kernel, in samples.
+    gauss_sigma: float = 4.0
+    # Kernel radius is ceil(gauss_truncate * gauss_sigma) samples.
+    gauss_truncate: float = 4.0
+    # `np.pad` mode used to extend the series before convolving.
+    pad_mode: str = "reflect"
+    # Optional log of a resumed run; must be given together with
+    # `resume_expected_length`.
+    resume_log_path: str | None = None
+    # Exact number of loss values expected in `resume_log_path`.
+    resume_expected_length: int | None = None
+
+
+def gaussian_smooth(
+    series: list[float], sigma: float, *, truncate: float = 4.0, mode: str = "reflect"
+) -> list[float]:
+    """Smooth `series` with a normalized, truncated Gaussian kernel.
+
+    The kernel covers integer offsets in [-r, r] with r = ceil(truncate * sigma)
+    and is scaled to sum to one. The series is padded by r samples on each side
+    using `np.pad(..., mode=mode)`, so the output has the same length as the
+    input.
+
+    Args:
+        series: Values to smooth.
+        sigma: Standard deviation of the Gaussian, in samples; must be > 0.
+        truncate: Kernel radius expressed in multiples of `sigma`.
+        mode: Padding mode forwarded to `np.pad`.
+
+    Returns:
+        The smoothed values as a plain list of floats.
+
+    Raises:
+        ValueError: If `sigma` is not positive.
+    """
+    if sigma <= 0:
+        raise ValueError("sigma must be positive")
+
+    samples = np.asarray(series, dtype=np.float64)
+
+    # Build the kernel weights over symmetric integer offsets.
+    half_width = int(math.ceil(truncate * sigma))
+    offsets = np.arange(-half_width, half_width + 1, dtype=np.float64)
+    weights = np.exp(-(offsets**2) / (2 * sigma**2))
+    weights = weights / weights.sum()
+
+    # Padding by the kernel radius makes the "valid" convolution return exactly
+    # one output per input sample.
+    extended = np.pad(samples, (half_width, half_width), mode=mode)
+    return np.convolve(extended, weights, mode="valid").tolist()
+
+
+def check_smooth_loss(
+    losses: list[float],
+    expected_length: int,
+    arg: Arg,
+    prefix: str,
+    *,
+    log_path: str | None = None,
+) -> list[float]:
+    """Check the number of losses and that the smoothed curve does not rise.
+
+    Args:
+        losses: Raw loss values extracted from a log.
+        expected_length: Exact number of values `losses` must contain.
+        arg: Check configuration supplying the smoothing parameters.
+        prefix: Label used in the printed output (e.g. "Training").
+        log_path: File the losses were read from, used only in the error
+            message. Defaults to `arg.log_path`.
+
+    Returns:
+        The Gaussian-smoothed losses.
+
+    Raises:
+        AssertionError: If the count is wrong, or if the first smoothed value
+            is smaller than the last one.
+    """
+    # Report the file the values actually came from; the resume check reads a
+    # different log than `arg.log_path`.
+    source_path = arg.log_path if log_path is None else log_path
+    print(f"{prefix} raw losses:", losses)
+    assert len(losses) == expected_length, (
+        f"Expected {expected_length} losses, found {len(losses)} in {source_path}."
+    )
+    smoothed = gaussian_smooth(losses, arg.gauss_sigma, truncate=arg.gauss_truncate, mode=arg.pad_mode)
+    print(f"{prefix} smoothed losses:", smoothed)
+    assert smoothed[0] >= smoothed[-1], "Losses should drop over time when smoothed."
+    return smoothed
+
+
+@wrap()
+def main(arg: Arg):
+    """Check that training loss drops and, optionally, that a resumed run continues it.
+
+    The training log is always checked. When both `resume_log_path` and
+    `resume_expected_length` are set, the resume log is checked the same way,
+    and its first smoothed loss must be at least as close to the end of
+    training as to its start.
+
+    Args:
+        arg: Parsed check configuration.
+
+    Raises:
+        ValueError: If only one of the two resume options is provided.
+        AssertionError: If any of the loss checks fails.
+    """
+    losses = grep_file(arg.log_path, arg.re_pattern, processor=float)
+    smoothed = check_smooth_loss(losses, arg.expected_length, arg, "Training")
+
+    if arg.resume_expected_length is None and arg.resume_log_path is None:
+        return
+
+    if arg.resume_expected_length is None or arg.resume_log_path is None:
+        raise ValueError(
+            "Both resume_log_path and resume_expected_length must be provided if one is given. "
+            f"Got resume_log_path: {arg.resume_log_path}, "
+            f"Got resume_expected_length: {arg.resume_expected_length}, "
+        )
+
+    resume_losses = grep_file(arg.resume_log_path, arg.re_pattern, processor=float)
+    resume_smoothed = check_smooth_loss(
+        resume_losses,
+        arg.resume_expected_length,
+        arg,
+        "Resume",
+        log_path=arg.resume_log_path,
+    )
+
+    # resuming start should be closer to the end of the training than the start
+    resume_start = resume_smoothed[0]
+    training_start = smoothed[0]
+    training_end = smoothed[-1]
+    print(
+        f"{resume_start=}, {training_start=}, {training_end=}, "
+        f"{abs(resume_start - training_end)=}, {abs(resume_start - training_start)=}."
+    )
+    assert abs(resume_start - training_end) <= abs(resume_start - training_start), (
+        "Resuming start loss should be closer to the end of the training than the start."
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/check_nonzero_grad_norm.py b/.github/scripts/check_nonzero_grad_norm.py
@@ -0,0 +1,35 @@
+# Copyright 2026 Tensor Auto Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+from utils import grep_file
+
+from opentau.configs.parser import wrap
+
+
+@dataclass
+class Arg:
+    """Arguments consumed by `main` for the gradient-norm log check.
+
+    Populated by the `wrap()` decorator (presumably from command-line flags;
+    confirm against `opentau.configs.parser`).
+    """
+
+    # Path of the log file to scan.
+    log_path: str
+    # Exact number of gradient-norm values expected in the log.
+    expected_length: int
+    # Regex searched in each log line; capture group 1 is parsed with `float`.
+    re_pattern: str = r"grad_norm:([0-9.eE+-]+)"
+
+
+@wrap()
+def main(arg: Arg) -> None:
+    """Verify that every gradient norm found in the log is strictly positive.
+
+    Args:
+        arg: Parsed check configuration.
+
+    Raises:
+        AssertionError: If the number of values differs from
+            `arg.expected_length` or any value is not greater than zero.
+    """
+    grad_norm = grep_file(arg.log_path, arg.re_pattern, processor=float)
+    assert len(grad_norm) == arg.expected_length, (
+        f"Expected {arg.expected_length} grad_norms, found {len(grad_norm)} in {arg.log_path}."
+    )
+    assert all(g > 0 for g in grad_norm), f"All grad_norms should be greater than zero, got {grad_norm}."
+
+
+# Without this guard, executing the script only defines `main` and exits
+# successfully without checking anything.
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/check_state_keys.py b/.github/scripts/check_state_keys.py
@@ -0,0 +1,128 @@
+# Copyright 2026 Tensor Auto Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+from opentau.configs.parser import wrap
+
+# Expected "missing keys" report per checkpoint source.
+# A set means every missing-keys section in the log must equal exactly that
+# set; None means the log must not contain any missing-keys section at all.
+MISSING_KEYS = {
+    "hf": {
+        "normalize_inputs.buffer_state.max",
+        "normalize_inputs.buffer_state.min",
+        "normalize_targets.buffer_actions.mean",
+        "normalize_targets.buffer_actions.std",
+        "normalize_actions.buffer_actions.max",
+        "normalize_actions.buffer_actions.min",
+        "unnormalize_outputs.buffer_actions.mean",
+        "unnormalize_outputs.buffer_actions.std",
+        "model.paligemma_with_expert.discrete_action_embedding.weight",
+        "model.paligemma_with_expert.da_head.weight",
+        "model.paligemma_with_expert.da_head.bias",
+    },
+    "local": None,
+}
+
+
+@dataclass
+class Arg:
+    """Arguments consumed by `main` for the state-dict key check.
+
+    Raises:
+        ValueError: On construction, if `source` is not a key of `MISSING_KEYS`.
+    """
+
+    # Path of the log file to scan.
+    log_path: str
+    # Checkpoint source; selects the expected missing-key set in MISSING_KEYS.
+    source: str
+
+    def __post_init__(self):
+        """Reject sources that have no entry in `MISSING_KEYS`."""
+        if self.source not in MISSING_KEYS:
+            # Format the choices as a plain list; interpolating the dict view
+            # rendered as "dict_keys([...])" in the message.
+            raise ValueError(f"--source must be one of {sorted(MISSING_KEYS)}. Got {self.source!r}")
+
+
+def parse_missing_keys(log_path: str) -> list[set[str]]:
+    """Collect every "missing keys" section found in a log file.
+
+    A section starts at a line containing
+    "Missing keys when loading state dict:" and is followed by entries of
+    the form "  - key". Blank lines and lines that start with "-" but are
+    not entries are skipped; any other non-empty line closes the section.
+
+    Args:
+        log_path: Path of the log file to read.
+
+    Returns:
+        One set of key names per section, in order of appearance. A section
+        without entries yields an empty set.
+    """
+    sections = []
+    active = None  # keys of the section being read, or None outside a section
+
+    with open(log_path) as log_file:
+        for raw_line in log_file:
+            if "Missing keys when loading state dict:" in raw_line:
+                # A new header implicitly closes a section that is still open.
+                if active is not None:
+                    sections.append(active)
+                active = set()
+                continue
+
+            if active is None:
+                continue
+
+            text = raw_line.strip()
+            if text.startswith("- "):
+                active.add(text[2:].strip())
+                continue
+
+            # Anything that is neither blank nor dash-led terminates the section.
+            if text[:1] not in ("", "-"):
+                sections.append(active)
+                active = None
+
+    # The file may end while a section is still open.
+    if active is not None:
+        sections.append(active)
+
+    return sections
+
+
+def check_no_unexpected_keys(log_path: str):
+    """Fail if the log reports unexpected state-dict keys.
+
+    Args:
+        log_path: Path of the log file to read.
+
+    Raises:
+        ValueError: If any line contains
+            "Unexpected keys when loading state dict:"; the message quotes
+            the first such line.
+    """
+    print("Checking for unexpected keys")
+    with open(log_path) as log_file:
+        # Stop reading at the first offending line, if there is one.
+        line = next(
+            (entry for entry in log_file if "Unexpected keys when loading state dict:" in entry),
+            None,
+        )
+    if line is not None:
+        raise ValueError(f"Found unexpected keys in log: {line.strip()}")
+    print("Passed - no unexpected keys found")
+
+
+def check_missing_keys(key_sets: list[set[str]], source: str):
+    """Compare the parsed missing-key sections with what `source` allows.
+
+    Args:
+        key_sets: One set of key names per missing-keys section in the log.
+        source: Key into `MISSING_KEYS` selecting the expectation.
+
+    Raises:
+        ValueError: If sections exist although none are expected, if none
+            exist although some are expected, or if a section differs from
+            the expected set.
+    """
+    print("Checking missing keys")
+    expected_keys = MISSING_KEYS[source]
+
+    # A source that expects a clean load must not log any section.
+    if expected_keys is None and key_sets:
+        raise ValueError(f"Found missing keys but expecting none: {key_sets}")
+    # A source with known gaps must log at least one section.
+    if expected_keys is not None and not key_sets:
+        raise ValueError(f"No missing keys found, should be {expected_keys}")
+
+    # When nothing is expected, key_sets is empty here and the loop is a no-op.
+    for i, keys in enumerate(key_sets):
+        if keys == expected_keys:
+            continue
+        missing_from_expected = expected_keys - keys
+        extra_in_found = keys - expected_keys
+        raise ValueError(
+            f"Missing keys mismatch at occurrence {i + 1}:\n"
+            f"  Expected but not found: {missing_from_expected}\n"
+            f"  Found but not expected: {extra_in_found}"
+        )
+    print("Passed")
+
+
+@wrap()
+def main(arg: Arg) -> None:
+    """Run both state-dict key checks against the configured log.
+
+    The unexpected-key check runs first; the missing-key sections are then
+    parsed and compared with the expectation for `arg.source`.
+
+    Args:
+        arg: Parsed check configuration.
+    """
+    log_path = arg.log_path
+    check_no_unexpected_keys(log_path)
+    check_missing_keys(parse_missing_keys(log_path), arg.source)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/utils.py b/.github/scripts/utils.py
@@ -0,0 +1,27 @@
+# Copyright 2026 Tensor Auto Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+
+def grep_file(file: str, pattern: str, processor=None) -> list:
+    """Extract one value from every line of `file` that matches `pattern`.
+
+    Only the first match on each line is used. The extracted text is capture
+    group 1 when the pattern defines a group, otherwise the whole match.
+
+    Args:
+        file: Path of the text file to scan.
+        pattern: Regular expression searched for in each line.
+        processor: Optional callable applied to the extracted text; the text
+            is returned unchanged when omitted.
+
+    Returns:
+        The processed values, in file order. Lines without a match are skipped.
+    """
+    processor = processor or (lambda x: x)
+    # Compile once instead of looking the pattern up for every line.
+    regex = re.compile(pattern)
+    # A pattern without a capture group used to fail with IndexError on
+    # `group(1)`; fall back to the whole match in that case.
+    group = 1 if regex.groups else 0
+    values = []
+    with open(file) as f:
+        for line in f:
+            match = regex.search(line)
+            if match:
+                values.append(processor(match.group(group)))
+    return values
diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
@@ -42,14 +42,14 @@ jobs:
 
       - name: Start Instance
         run: |
-          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg --desired-capacity 1
+          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg-g6-2xlarge --desired-capacity 1
           echo "Waiting for instance to be ready..."
 
   gpu-test:
     name: Run Pytest on GPU
     needs: start-runner
     runs-on: [g6.2xlarge]
-    timeout-minutes: 60
+    timeout-minutes: 30
 
     container:
       image: nvidia/cuda:12.2.0-devel-ubuntu22.04
@@ -110,4 +110,4 @@ jobs:
 
       - name: Stop Instance
         run: |
-          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg --desired-capacity 0
+          aws autoscaling set-desired-capacity --auto-scaling-group-name github-runner-asg-g6-2xlarge --desired-capacity 0