diff --git a/README.md b/README.md
index 7b50c34944..87258ef8c2 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ Check out our [Read The Docs site](https://maxtext.readthedocs.io/en/latest/) or
 See our installation guide to [install MaxText with pip from PyPI](https://maxtext.readthedocs.io/en/latest/install_maxtext.html#from-pypi-recommended).
 
 ## Decoupled mode
-See our guide on running MaxText in decoupled mode, without any GCP dependencies in [Decoupled Mode Guide](https://maxtext.readthedocs.io/en/latest/guides/run_maxtext/decoupled_mode.html).
+See our guide on running MaxText in decoupled mode, without any GCP dependencies in [Decoupled Mode Guide](https://maxtext.readthedocs.io/en/latest/run_maxtext/decoupled_mode.html).
 
 <!-- NEWS START -->
 
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000000..02d44dea78
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,14 @@
+"""Top-level shim for importing test_utils
+
+This shim lets test modules import `maxtext.tests`.
+
+"""
+
+from importlib import import_module as _imp
+
+try:
+    test_utils = _imp("maxtext.tests.test_utils")  # noqa: F401
+except Exception:  # pragma: no cover - fail silently if tests not present
+    pass
+
+__all__ = ["test_utils"]
diff --git a/local_datasets/generate_tfds_metadata.py b/local_datasets/generate_tfds_metadata.py
index 8d472b2d64..e886de8c41 100644
--- a/local_datasets/generate_tfds_metadata.py
+++ b/local_datasets/generate_tfds_metadata.py
@@ -18,7 +18,6 @@
   python local_datasets/generate_tfds_metadata.py \
       --root local_datasets/c4_en_dataset_minimal \
       --version 3.1.0 \
-      --source-version 3.0.1 \
       --force
 
 This script creates a tiny TFDS builder and outputs the ``dataset_info.json`` and 
@@ -29,36 +28,22 @@
 """
 from __future__ import annotations
 import os
+import json
 import argparse
 import tensorflow_datasets as tfds  # type: ignore
 
 
-def ensure_symlink(root: str, source_version: str, version: str) -> str:
-  """Ensure a symlink exists from source_version to version under root/c4/en.
-
-  Returns the target version directory path.
-  """
-  src = os.path.join(root, "c4", "en", source_version)
-  dst = os.path.join(root, "c4", "en", version)
-  if not os.path.isdir(src):
-    raise FileNotFoundError(f"Source version directory not found: {src}")
-  if not os.path.lexists(dst):
-    try:
-      os.symlink(src, dst)
-      print(f"Created symlink {dst} -> {src}")
-    except OSError as exc:
-      print(f"Symlink creation failed (continuing): {exc}")
-  else:
-    print(f"Symlink already exists: {dst}")
-  return dst
-
-
 def write_metadata(root: str, version_dir: str, dataset_version: str, force: bool = False) -> None:
   """Write TFDS ``dataset_info.json`` and ``features.json`` for local C4 shards."""
+
   info_path = os.path.join(version_dir, "dataset_info.json")
-  if os.path.exists(info_path) and not force:
-    print("dataset_info.json already exists; skipping overwrite (use --force to regenerate).")
-    return
+  if os.path.exists(info_path):
+    if force:
+      os.remove(info_path)
+      print("Removed existing dataset_info.json due to --force.")
+    else:
+      print("dataset_info.json already exists; skipping overwrite (use --force to regenerate).")
+      return
 
   # Discover shards (we assume they exist and are correct; counts are fixed)
   num_shards_train = 8
@@ -107,6 +92,17 @@ def _generate_examples(self):  # type: ignore[override]
   info.write_to_directory(version_dir)
   print(f"Wrote TFDS dataset_info & features to {version_dir}")
 
+  info_path = os.path.join(version_dir, "dataset_info.json")
+  try:
+    with open(info_path, "r") as f:
+      data = json.load(f)
+    if isinstance(data.get("splits"), dict):
+      data["splits"] = list(data["splits"].values())
+      with open(info_path, "w") as f:
+        json.dump(data, f, indent=2)
+  except Exception as e:
+    print(f"Warning: Could not patch splits in dataset_info.json: {e}")
+
 
 def main() -> None:
   """CLI entry point for generating TFDS metadata."""
@@ -121,11 +117,6 @@ def main() -> None:
       default="3.1.0",
       help="Target version to expose via TFDS",
   )
-  ap.add_argument(
-      "--source-version",
-      default="3.0.1",
-      help="Existing version directory with shards",
-  )
   ap.add_argument(
       "--force",
       action="store_true",
@@ -133,8 +124,11 @@ def main() -> None:
   )
   args = ap.parse_args()
 
-  target_dir = ensure_symlink(args.root, args.source_version, args.version)
-  write_metadata(args.root, target_dir, args.version, force=args.force)
+  # Use the version directory directly
+  version_dir = os.path.join(args.root, "c4", "en", args.version)
+  if not os.path.isdir(version_dir):
+    raise FileNotFoundError(f"Version directory not found: {version_dir}")
+  write_metadata(args.root, version_dir, args.version, force=args.force)
   print("Done.")
 
 
diff --git a/src/MaxText/configs/decoupled_base_test.yml b/src/MaxText/configs/decoupled_base_test.yml
index 650d09e30b..50bdf5bf16 100644
--- a/src/MaxText/configs/decoupled_base_test.yml
+++ b/src/MaxText/configs/decoupled_base_test.yml
@@ -1,9 +1,9 @@
 # Decoupled base test config: used when DECOUPLE_GCLOUD=TRUE for tests that previously relied on base.yml.
-# Inherit all model defaults from base.yml but override any cloud-coupled paths and disable optional cloud features.
-base_config: base.yml
+# Inherit all model defaults (PyDantic already does this)but override any cloud-coupled paths and disable 
+# optional cloud features.
 
 # Output goes to a local relative directory so tests do not require GCS.
-base_output_directory: ./maxtext_local_output
+base_output_directory: ./local_datasets/gcloud_decoupled_test_logs
 run_name: test_decoupled
 
 # Disable checkpointing by default for speed unless a test explicitly enables it.
@@ -23,7 +23,10 @@ profile_periodically_period: 0
 profiler_steps: 0
 
 # Leave dataset-related keys to be overridden by individual tests.
-dataset_type: ""
+#dataset_type: ""
+dataset_path: "local_datasets/c4_en_dataset_minimal/"
+dataset_name: 'c4/en:3.1.0'
+eval_dataset_name: 'c4/en:3.1.0'
 
 # Use dot_product attention to avoid GPU Pallas shared memory limits on AMD GPUs
 attention: "dot_product"
@@ -44,6 +47,8 @@ ici_tensor_sequence_parallelism: 1
 ici_autoregressive_parallelism: 1
 ici_fsdp_parallelism: 1
 ici_fsdp_transpose_parallelism: 1
+# Allow higher unsharded parameter percentage for small device count
+sharding_tolerance: 0.3
 
 # DCN dimensions to 1 (no multi-slice expectation locally).
 dcn_data_parallelism: 1
@@ -68,12 +73,4 @@ goodput_upload_interval_seconds: 0
 enable_pathways_goodput: false
 enable_gcp_goodput_metrics: false
 
-# Disable any cloud logging / BigQuery or external metric uploads.
-enable_cloud_logging: false
-upload_metrics_to_bigquery: false
-bigquery_project: ""
-bigquery_dataset: ""
-bigquery_table: ""
-
-# Force local-only behavior for tests: avoid accidental env pickup.
-tensorboard_dir: "./maxtext_local_output/tensorboard"
+tensorboard_dir: "./local_datasets/gcloud_decoupled_test_logs/tensorboard"
diff --git a/src/MaxText/decode.py b/src/MaxText/decode.py
index 21337bb7ee..579679891c 100644
--- a/src/MaxText/decode.py
+++ b/src/MaxText/decode.py
@@ -15,13 +15,14 @@
 """CLI utility for running inference on a single/multi stream(s)."""
 
 import os
-from typing import Sequence
+from typing import Sequence, Any
 import jax
 import jax.numpy as jnp
 
 from absl import app
 
-from jetstream.engine import engine_api
+from MaxText.gcloud_stub import jetstream, is_decoupled
+_config_lib, engine_api, _token_utils, _tokenizer_api, _token_params_ns = jetstream()
 
 from MaxText import max_utils
 from MaxText import maxengine
@@ -34,7 +35,7 @@
 _NUM_STREAMS = 1
 
 
-def _batch_first_result_token(first_tokens: list[engine_api.ResultTokens], batch_size: int):
+def _batch_first_result_token(first_tokens: list[Any], batch_size: int):
   """Batches together a list of first result tokens from prefill calls.
 
   This is needed because prefill currently returns the first token as a batch of size 1
@@ -112,6 +113,14 @@ def main(argv: Sequence[str]) -> None:
 
   metadata = engine.get_tokenizer()
   tokenizer_model = engine.build_tokenizer(metadata)
+  token_params_is_stub = getattr(_token_params_ns, "_IS_STUB", False)
+  engine_api_is_stub = getattr(engine_api, "_IS_STUB", False)
+  if is_decoupled() and (token_params_is_stub or engine_api_is_stub):
+    raise RuntimeError(
+        "JetStream disabled by DECOUPLE_GCLOUD=TRUE or stubbed; decode requires the JetStream tokenizer. "
+        "Unset DECOUPLE_GCLOUD or install JetStream to run decode."
+    )
+
   try:
     # TODO: update jetstream.engine.tokenizer_api.Tokenizer to maintain tokenizer state.
     has_chat_template = getattr(tokenizer_model.tokenizer, "chat_template", False)  # pytype: disable=attribute-error
diff --git a/src/MaxText/elastic_train.py b/src/MaxText/elastic_train.py
index e2ee2ec958..2c69178b8b 100644
--- a/src/MaxText/elastic_train.py
+++ b/src/MaxText/elastic_train.py
@@ -45,10 +45,8 @@
 
 from absl import app
 
-from cloud_tpu_diagnostics import diagnostic
-from cloud_tpu_diagnostics.configuration import debug_configuration
-from cloud_tpu_diagnostics.configuration import diagnostic_configuration
-from cloud_tpu_diagnostics.configuration import stack_trace_configuration
+from MaxText.gcloud_stub import cloud_diagnostics as _cloud_diag
+diagnostic, debug_configuration, diagnostic_configuration, stack_trace_configuration = _cloud_diag()
 
 import jax
 
@@ -80,7 +78,10 @@
     maybe_monitor_goodput,
     maybe_record_goodput,
 )
-from MaxText.vertex_tensorboard import VertexTensorboardManager
+from MaxText.gcloud_stub import vertex_tensorboard_components, is_decoupled
+from MaxText.gcloud_stub import cloud_diagnostics as _cloud_diag
+VertexTensorboardManager, _vertex_tb_is_stub = vertex_tensorboard_components()
+diagnostic, debug_configuration, diagnostic_configuration, stack_trace_configuration = _cloud_diag()
 
 logging.basicConfig()
 logging.getLogger("pathwaysutils.elastic.manager").setLevel(logging.INFO)
@@ -386,7 +387,10 @@ def main(argv: Sequence[str]) -> None:
   os.environ["TFDS_DATA_DIR"] = config.dataset_path or ""
   vertex_tensorboard_manager = VertexTensorboardManager()
   if config.use_vertex_tensorboard or os.environ.get("UPLOAD_DATA_TO_TENSORBOARD"):
-    vertex_tensorboard_manager.configure_vertex_tensorboard(config)
+    if _vertex_tb_is_stub:
+      max_logging.log("[DECOUPLED NO-OP] skipping Vertex Tensorboard configuration.")
+    else:
+      vertex_tensorboard_manager.configure_vertex_tensorboard(config)
 
   # Create the Goodput recorder
   recorder = create_goodput_recorder(config)
@@ -401,9 +405,15 @@ def main(argv: Sequence[str]) -> None:
   )
   diagnostic_config = diagnostic_configuration.DiagnosticConfig(debug_config)
 
-  with diagnostic.diagnose(diagnostic_config):
-    with maybe_record_goodput(recorder, GoodputEvent.JOB), maybe_monitor_goodput(config):
+  # In decoupled mode or when diagnostics are stubbed, skip the diagnose wrapper
+  if is_decoupled() or getattr(diagnostic, "__class__", None).__name__ == "_StubDiag":
+    max_logging.log("[DECOUPLED NO-OP] skipping cloud diagnostics wrapper.")
+    with maybe_record_goodput(recorder, GoodputEvent.JOB):
       train_loop(config, elastic_manager, recorder)
+  else:
+    with diagnostic.diagnose(diagnostic_config):
+      with maybe_record_goodput(recorder, GoodputEvent.JOB):
+        train_loop(config, elastic_manager, recorder)
 
 
 if __name__ == "__main__":
diff --git a/src/MaxText/experimental/rl/grpo_trainer.py b/src/MaxText/experimental/rl/grpo_trainer.py
index 926f1e1b67..68d9598b46 100644
--- a/src/MaxText/experimental/rl/grpo_trainer.py
+++ b/src/MaxText/experimental/rl/grpo_trainer.py
@@ -57,10 +57,8 @@
 from flax import struct
 from flax.nnx import TrainState
 
-from cloud_tpu_diagnostics import diagnostic
-from cloud_tpu_diagnostics.configuration import debug_configuration
-from cloud_tpu_diagnostics.configuration import diagnostic_configuration
-from cloud_tpu_diagnostics.configuration import stack_trace_configuration
+from MaxText.gcloud_stub import cloud_diagnostics as _cloud_diag, vertex_tensorboard_components, is_decoupled
+diagnostic, debug_configuration, diagnostic_configuration, stack_trace_configuration = _cloud_diag()
 
 import transformers
 
@@ -92,7 +90,7 @@
     maybe_monitor_goodput,
     maybe_record_goodput,
 )
-from MaxText.vertex_tensorboard import VertexTensorboardManager
+VertexTensorboardManager, _vertex_tb_is_stub = vertex_tensorboard_components()
 
 
 # pylint: disable=too-many-positional-arguments
@@ -944,7 +942,10 @@ def main(argv: Sequence[str]) -> None:
   os.environ["TFDS_DATA_DIR"] = config.dataset_path
   vertex_tensorboard_manager = VertexTensorboardManager()
   if config.use_vertex_tensorboard or os.environ.get("UPLOAD_DATA_TO_TENSORBOARD"):
-    vertex_tensorboard_manager.configure_vertex_tensorboard(config)
+    if _vertex_tb_is_stub:
+      max_logging.log("[DECOUPLED NO-OP] skipping Vertex Tensorboard configuration.")
+    else:
+      vertex_tensorboard_manager.configure_vertex_tensorboard(config)
 
   # Create the Goodput recorder
   recorder = create_goodput_recorder(config)
@@ -959,9 +960,14 @@ def main(argv: Sequence[str]) -> None:
   )
   diagnostic_config = diagnostic_configuration.DiagnosticConfig(debug_config)
 
-  with diagnostic.diagnose(diagnostic_config):
-    with maybe_record_goodput(recorder, GoodputEvent.JOB), maybe_monitor_goodput(config):
+  if is_decoupled() or getattr(diagnostic, "__class__", None).__name__ == "_StubDiag":
+    max_logging.log("[DECOUPLED NO-OP] skipping cloud diagnostics wrapper.")
+    with maybe_record_goodput(recorder, GoodputEvent.JOB):
       train_loop(config, config_inference, recorder)
+  else:
+    with diagnostic.diagnose(diagnostic_config):
+      with maybe_record_goodput(recorder, GoodputEvent.JOB):
+        train_loop(config, config_inference, recorder)
 
 
 if __name__ == "__main__":
diff --git a/src/MaxText/gcloud_stub.py b/src/MaxText/gcloud_stub.py
index 852eeedd4f..7c03a09de6 100644
--- a/src/MaxText/gcloud_stub.py
+++ b/src/MaxText/gcloud_stub.py
@@ -1,4 +1,4 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2025 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -150,14 +150,14 @@ def __init__(
 
   # Tokenizer placeholders (unused in decoupled tests due to runtime guard).
   class TokenizerParameters:  # pragma: no cover - placeholder
-    """Stub tokenizer parameters object."""
-
-    def __init__(self, *a, **k):  # pylint: disable=unused-argument
+    def __init__(self, *a, **k):
       pass
 
-  class TokenizerType:  # emulate enum descriptor access pattern
-    """Stub tokenizer type descriptor container."""
+  class _FakeDescriptorValues:
+    def __init__(self):
+      self.values_by_name = {}
 
+  class TokenizerType:  # emulate enum descriptor access pattern
     DESCRIPTOR = SimpleNamespace(values_by_name={})
 
   config_lib = SimpleNamespace()  # not used directly in decoupled tests
@@ -165,11 +165,16 @@ class TokenizerType:  # emulate enum descriptor access pattern
   token_utils = SimpleNamespace()  # build_tokenizer guarded in MaxEngine when decoupled
   tokenizer_api = SimpleNamespace()  # placeholder
   token_params_ns = SimpleNamespace(TokenizerParameters=TokenizerParameters, TokenizerType=TokenizerType)
-  return config_lib, engine_api, token_utils, tokenizer_api, token_params_ns
 
+  # Mark these stub namespaces so callers can detect stubbed jetstream components.
+  setattr(config_lib, "_IS_STUB", True)
+  setattr(engine_api, "_IS_STUB", True)
+  setattr(token_utils, "_IS_STUB", True)
+  setattr(tokenizer_api, "_IS_STUB", True)
+  setattr(token_params_ns, "_IS_STUB", True)
+  return config_lib, engine_api, token_utils, tokenizer_api, token_params_ns
 
 def jetstream():
-  """Return JetStream modules or stubs based on availability and decoupling."""
   needed = [
       "jetstream.core.config_lib",
       "jetstream.engine.engine_api",
@@ -184,18 +189,29 @@ def jetstream():
           print("[DECOUPLED NO-OP] jetstream: dependency missing; using stubs.")
           return _jetstream_stubs()
         raise ModuleNotFoundError(mod)
-
-    from jetstream.core import config_lib  # type: ignore  # pylint: disable=import-outside-toplevel
-    from jetstream.engine import engine_api, token_utils, tokenizer_api  # type: ignore  # pylint: disable=import-outside-toplevel
-    from jetstream.engine.tokenizer_pb2 import TokenizerParameters, TokenizerType  # type: ignore  # pylint: disable=import-outside-toplevel
-
-    return (
-        config_lib,
-        engine_api,
-        token_utils,
-        tokenizer_api,
-        SimpleNamespace(TokenizerParameters=TokenizerParameters, TokenizerType=TokenizerType),
-    )
+    from jetstream.core import config_lib  # type: ignore
+    from jetstream.engine import engine_api, token_utils, tokenizer_api  # type: ignore
+    from jetstream.engine.tokenizer_pb2 import TokenizerParameters, TokenizerType  # type: ignore
+    # Mark real modules as not stubs so consumers can detect the difference.
+    try:
+      setattr(config_lib, "_IS_STUB", False)
+    except Exception:
+      pass
+    try:
+      setattr(engine_api, "_IS_STUB", False)
+    except Exception:
+      pass
+    try:
+      setattr(token_utils, "_IS_STUB", False)
+    except Exception:
+      pass
+    try:
+      setattr(tokenizer_api, "_IS_STUB", False)
+    except Exception:
+      pass
+    token_params_ns = SimpleNamespace(TokenizerParameters=TokenizerParameters, TokenizerType=TokenizerType)
+    setattr(token_params_ns, "_IS_STUB", False)
+    return config_lib, engine_api, token_utils, tokenizer_api, token_params_ns
   except ModuleNotFoundError:
     if is_decoupled():
       print("[DECOUPLED NO-OP] jetstream: dependency missing; using stubs.")
@@ -494,7 +510,55 @@ def vertex_tensorboard_components():
 
 __all__.append("vertex_tensorboard_components")
 
-# ---------------- TensorBoardX (moved stub) -----------------
+# ---------------- ML Diagnostics (google_cloud_mldiagnostics) -----------------
+
+
+def _mldiagnostics_stub():  # pragma: no cover - simple placeholder
+  """Return stub for google_cloud_mldiagnostics."""
+
+  class _StubXprof:
+    """Stub of mldiag.xprof context manager."""
+
+    def __init__(self, *a, **k):  # pylint: disable=unused-argument
+      pass
+
+    def __enter__(self):
+      return self
+
+    def __exit__(self, *a, **k):  # pylint: disable=unused-argument
+      pass
+
+  class _StubMldiag:
+    """Stub of mldiag module."""
+
+    def xprof(self, *a, **k):  # pylint: disable=unused-argument
+      """Return a stub context manager."""
+      return _StubXprof()
+
+  return _StubMldiag(), True
+
+
+def mldiagnostics_modules():
+  """Return (mldiag, is_stub) centralizing stub logic.
+
+  If decoupled OR import fails, returns stub object; otherwise real module.
+  """
+  if is_decoupled():  # fast path: never attempt heavy import
+    print("[DECOUPLED NO-OP] mldiagnostics: using stub.")
+    return _mldiagnostics_stub()
+
+  try:
+    import google_cloud_mldiagnostics as mldiag  # type: ignore  # pylint: disable=import-outside-toplevel
+
+    return mldiag, False
+  except Exception:  # ModuleNotFoundError / ImportError  # pylint: disable=broad-exception-caught
+    print("[NO-OP] mldiagnostics dependency missing; using stub.")
+    return _mldiagnostics_stub()
+
+
+__all__.append("mldiagnostics_modules")
+
+# ------------------------- TensorBoardX --------------------------
 
 try:
   if not is_decoupled():  # Only attempt real import when not decoupled
diff --git a/src/MaxText/gcp_workload_monitor.py b/src/MaxText/gcp_workload_monitor.py
index 93b79fd750..1bdd3b55ff 100644
--- a/src/MaxText/gcp_workload_monitor.py
+++ b/src/MaxText/gcp_workload_monitor.py
@@ -24,9 +24,11 @@
 
 import jax
 
-from google.api import metric_pb2, monitored_resource_pb2
-from google.api_core.exceptions import GoogleAPIError
-from google.cloud import monitoring_v3
+# Centralized monitoring + decoupling imports
+from MaxText.gcloud_stub import monitoring_modules
+
+monitoring_v3, metric_pb2, monitored_resource_pb2, GoogleAPIError, _MONITORING_STUB = monitoring_modules()
+_GCLOUD_AVAILABLE = not _MONITORING_STUB
 
 from urllib3.util.retry import Retry
 
@@ -45,7 +47,7 @@ def __init__(self, run_name: str):
     self.workload_id = f"{run_name if run_name else 'maxtext-unnamed'}-{timestamp}"
     self.zone = get_node_zone()
     self.project_id = get_gcp_project_id()
-    self.client = monitoring_v3.MetricServiceClient()
+    self.client = monitoring_v3.MetricServiceClient() if _GCLOUD_AVAILABLE else None
     self.heartbeat_reporting_started = False
     self.performance_reporting_started = False
     self.termination_event = threading.Event()
@@ -93,6 +95,9 @@ def _report_performance_thread(self, metrics_queue: queue.Queue):
 
   def _report_heartbeat(self, local_rank: str, global_rank: str):
     """Reports heartbeat metric for the process specified by the given local rank & global rank."""
+    if not _GCLOUD_AVAILABLE:
+      max_logging.log("[DECOUPLED NO-OP] heartbeat metric skipped (google monitoring unavailable).")
+      return
     try:
       now = time.time()
       seconds = int(now)
@@ -138,6 +143,9 @@ def _report_heartbeat(self, local_rank: str, global_rank: str):
 
   def _report_performance(self, performance_metric):
     """Reports performance metric to GCP."""
+    if not _GCLOUD_AVAILABLE:
+      max_logging.log("[DECOUPLED NO-OP] performance metric skipped (google monitoring unavailable).")
+      return
     try:
       now = time.time()
       seconds = int(now)
diff --git a/src/MaxText/managed_mldiagnostics.py b/src/MaxText/managed_mldiagnostics.py
index 9b0b5a318b..90ff825477 100644
--- a/src/MaxText/managed_mldiagnostics.py
+++ b/src/MaxText/managed_mldiagnostics.py
@@ -16,7 +16,9 @@
 import json
 from typing import Any
 
-import google_cloud_mldiagnostics as mldiag
+from MaxText.gcloud_stub import mldiagnostics_modules
+
+mldiag, _ = mldiagnostics_modules()
 
 from MaxText.pyconfig import KEYS_NO_LOGGING
 
diff --git a/src/MaxText/max_utils.py b/src/MaxText/max_utils.py
index 5b5811e6ba..8feede7bb0 100644
--- a/src/MaxText/max_utils.py
+++ b/src/MaxText/max_utils.py
@@ -28,6 +28,7 @@
 from etils import epath
 import flax
 import jax
+from pathlib import Path
 from contextlib import contextmanager
 from jax.experimental import mesh_utils
 from jax.sharding import PartitionSpec as P
@@ -36,9 +37,10 @@
 import orbax.checkpoint as ocp
 from orbax.checkpoint.experimental.emergency.multi_tier_checkpointing import initialization
 import psutil
-from tensorboardX import writer
+from MaxText.gcloud_stub import writer, _TENSORBOARDX_AVAILABLE
 
 from MaxText import max_logging
+from MaxText.gcloud_stub import is_decoupled
 from MaxText.common_types import MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE, MODEL_MODE_TRAIN
 
 initialize_multi_tier_checkpointing = initialization.initialize_multi_tier_checkpointing
@@ -139,8 +141,32 @@ def summarize_size_from_pytree(params):
 
 
 def initialize_summary_writer(tensorboard_dir, run_name):
+  if jax.process_index() != 0:
+    return None
+
+  if not _TENSORBOARDX_AVAILABLE:
+      max_logging.log("tensorboardX not available; using no-op SummaryWriter.")
+      return writer.SummaryWriter()
+
+  if is_decoupled():
+    # decoupled and tensorboardX is available -> write to repo-local 'local_tensorboard'
+    try:
+      repo_tb = Path(__file__).resolve().parents[2] / "local_tensorboard"
+      repo_tb.mkdir(parents=True, exist_ok=True)
+      summary_writer_path = str(repo_tb / run_name) if run_name else str(repo_tb)
+      max_logging.log(f"Decoupled: using local tensorboard dir {summary_writer_path}")
+      return writer.SummaryWriter(summary_writer_path)
+    except Exception as e:
+      max_logging.log(f"Decoupled: failed to use local tensorboard dir: {e}; using no-op SummaryWriter.")
+      return writer.SummaryWriter()
+
+  # Check if dir or run_name exists!
+  if not tensorboard_dir or not run_name:
+    max_logging.log("tensorboard_dir or run_name missing; using no-op SummaryWriter to avoid crash.")
+    return writer.SummaryWriter()
+
   summary_writer_path = os.path.join(tensorboard_dir, run_name)
-  return writer.SummaryWriter(summary_writer_path) if jax.process_index() == 0 else None
+  return writer.SummaryWriter(summary_writer_path)
 
 
 def close_summary_writer(summary_writer):
@@ -611,12 +637,18 @@ def print_model_vars(print_str, model_vars):
 
 def get_project():
   """Get project"""
-  completed_command = subprocess.run(["gcloud", "config", "get", "project"], check=True, capture_output=True)
-  project_outputs = completed_command.stdout.decode().strip().split("\n")
-  if len(project_outputs) < 1 or project_outputs[-1] == "":
-    max_logging.log("You must specify config.vertex_tensorboard_project or set 'gcloud config set project <project>'")
+  if is_decoupled():
+    return os.environ.get("LOCAL_GCLOUD_PROJECT", "local-maxtext-project")
+  try:
+    completed_command = subprocess.run(["gcloud", "config", "get", "project"], check=True, capture_output=True)
+    project_outputs = completed_command.stdout.decode().strip().split("\n")
+    if len(project_outputs) < 1 or project_outputs[-1] == "":
+      max_logging.log("You must specify config.vertex_tensorboard_project or set 'gcloud config set project <project>'")
+      return None
+    return project_outputs[-1]
+  except (FileNotFoundError, subprocess.CalledProcessError) as ex:
+    max_logging.log(f"Unable to retrieve gcloud project (decoupled={is_decoupled()}): {ex}")
     return None
-  return project_outputs[-1]
 
 
 def delete_pytree(p):
diff --git a/src/MaxText/maxengine.py b/src/MaxText/maxengine.py
index dff1382d08..3d49e1ebb1 100644
--- a/src/MaxText/maxengine.py
+++ b/src/MaxText/maxengine.py
@@ -36,12 +36,10 @@
 from flax.linen import partitioning as nn_partitioning
 import flax
 
-from jetstream.core import config_lib
-from jetstream.engine import engine_api
-from jetstream.engine import token_utils
-from jetstream.engine import tokenizer_api
-from jetstream.engine.tokenizer_pb2 import TokenizerParameters
-from jetstream.engine.tokenizer_pb2 import TokenizerType
+from MaxText.gcloud_stub import jetstream, is_decoupled
+config_lib, engine_api, token_utils, tokenizer_api, _token_params_ns = jetstream()
+TokenizerParameters = getattr(_token_params_ns, "TokenizerParameters", object)  # type: ignore[assignment]
+TokenizerType = getattr(_token_params_ns, "TokenizerType", object)  # type: ignore[assignment]
 
 from MaxText import inference_utils
 from MaxText import max_utils
@@ -98,14 +96,15 @@ def get_keys(self):
     return self.keys
 
 
-class MaxEngine(engine_api.Engine):
+_BaseEngine = engine_api.Engine if (not is_decoupled() and hasattr(engine_api, "Engine")) else object
+class MaxEngine(_BaseEngine):
   """The computational core of the generative model server.
 
   Engine defines an API that models must adhere to as they plug into the
   JetStream efficient serving infrastructure.
   """
 
-  def __init__(self, config: Any, devices: config_lib.Devices | None = None):
+  def __init__(self, config: Any, devices: Any | None = None):
     self.config = config
 
     # Mesh definition
@@ -142,7 +141,7 @@ def print_stats(self, label: str):
 
   def generate_aot(
       self, params: Params, decode_state: DecodeState, rng: PRNGKeyType | None = None
-  ) -> tuple[DecodeState, engine_api.ResultTokens]:
+  ):  # returns (new_decode_state, result_tokens)
     """Wrapper to generate for ahead of time compilation."""
 
     return self.generate(params=params, decode_state=decode_state, rng=rng)
@@ -392,7 +391,7 @@ def prefill_aot(  # pylint: disable=too-many-positional-arguments
       padded_tokens: jax.Array,
       true_length: int,
       rng: PRNGKeyType | None = None,
-  ) -> tuple[Prefix, engine_api.ResultTokens]:
+  ):  # returns (new_prefix, result_tokens)
     """Wrapper for prefill for ahead-of-time compilation."""
 
     return self.prefill(
@@ -423,7 +422,7 @@ def _prefill_jit(
       topk: int | None = None,
       nucleus_topp: float | None = None,
       temperature: float | None = None,
-  ) -> tuple[Prefix, engine_api.ResultTokens]:
+  ):  # returns (new_prefix, result_tokens)
     """Performs a JIT-compiled prefill operation on a sequence of tokens.
 
     This function processes an input sequence (prompt) through the model to compute
@@ -585,7 +584,7 @@ def prefill(
       topk: int | None = None,
       nucleus_topp: float | None = None,
       temperature: float | None = None,
-  ) -> tuple[Prefix, engine_api.ResultTokens]:
+  ):  # returns (new_prefix, result_tokens)
     """Public API for prefill that updates page state outside JIT."""
     # Update page state before JIT call
     if self.config.attention == "paged" and self.page_manager is not None and self.page_state is not None:
@@ -632,7 +631,7 @@ def prefill_multisampling_aot(  # pylint: disable=too-many-positional-arguments
       topk: int | None = None,
       nucleus_topp: float | None = None,
       temperature: float | None = None,
-  ) -> tuple[Prefix, engine_api.ResultTokens]:
+  ):  # returns (new_prefix, result_tokens)
     """Wrapper for multi-sampling prefill for ahead-of-time compilation."""
     return self.prefill_multisampling(
         params=params,
@@ -661,7 +660,7 @@ def prefill_multisampling(
       topk: int | None = None,
       nucleus_topp: float | None = None,
       temperature: float | None = None,
-  ) -> tuple[Prefix, engine_api.ResultTokens]:
+  ):  # returns (new_prefix, result_tokens)
     """Public API for prefill multisampling."""
 
     # Sample rng before JIT call
@@ -698,7 +697,7 @@ def _prefill_multisampling_jit(
       topk: int | None = None,
       nucleus_topp: float | None = None,
       temperature: float | None = None,
-  ) -> tuple[Prefix, engine_api.ResultTokens]:
+  ) -> tuple[Prefix, Any]:
     """Computes a kv-cache for a new generate request.
 
     With multi-sampling, the engine will generate multiple first tokens in the
@@ -805,7 +804,7 @@ def prefill_concat(
       topk: int | None = None,
       nucleus_topp: float | None = None,
       temperature: float | None = None,
-  ) -> tuple[Any, PackedPrefix, list[engine_api.ResultTokens]]:
+  ):  # returns (maybe_batch, packed_prefix, list_of_result_tokens)
     """Computes a kv-cache for a new packed generate request, which is a
     concatenation of several shorter prompts. Experimentation shows that
     longer prefill sequences gives approximately 15% boost in time per prefilled
@@ -922,7 +921,7 @@ def generate(
       topk: int | None = None,
       nucleus_topp: float | None = None,
       temperature: float | None = None,
-  ) -> tuple[DecodeState, engine_api.ResultTokens]:
+  ):  # returns (decode_state, result_tokens)
     """Public API for generate that updates page state outside JIT."""
 
     # Update page state before JIT call
@@ -965,7 +964,7 @@ def _generate_jit(
       topk: int | None = None,
       nucleus_topp: float | None = None,
       temperature: float | None = None,
-  ) -> tuple[DecodeState, engine_api.ResultTokens]:
+  ):  # returns (decode_state, result_tokens)
     """Performs a single, JIT-compiled autoregressive decoding step.
 
     This function takes the current decoding state, which includes the KV cache
@@ -1486,8 +1485,19 @@ def get_prefix_destination_sharding(self) -> Any:
         "token_logp": self.replicated_sharding,
     }
 
-  def get_tokenizer(self) -> TokenizerParameters:
-    """Return a protobuf of tokenizer info, callable from Py or C++."""
+  def get_tokenizer(self) -> Any:
+    """Return tokenizer parameters; requires JetStream when decoupled.
+
+    When DECOUPLE_GCLOUD is FALSE we provide a clear error instead of failing
+    cryptically on attribute access.
+    """
+    token_params_is_stub = getattr(_token_params_ns, "_IS_STUB", False)
+    engine_api_is_stub = getattr(engine_api, "_IS_STUB", False)
+    if is_decoupled() and (token_params_is_stub or engine_api_is_stub):
+      raise RuntimeError(
+          "JetStream disabled by DECOUPLE_GCLOUD=TRUE or stubbed; get_tokenizer is unsupported. "
+          "Unset DECOUPLE_GCLOUD or install JetStream to enable tokenizer functionality."
+      )
     try:
       tokenizer_type_val = TokenizerType.DESCRIPTOR.values_by_name[self.config.tokenizer_type].number
       return TokenizerParameters(
@@ -1500,8 +1510,15 @@ def get_tokenizer(self) -> TokenizerParameters:
     except KeyError as _:
       raise KeyError(f"Unsupported tokenizer type: {self.config.tokenizer_type}") from None
 
-  def build_tokenizer(self, metadata: TokenizerParameters) -> tokenizer_api.Tokenizer:
+  def build_tokenizer(self, metadata: Any):  # return type depends on JetStream
     """Return a tokenizer"""
+    token_params_is_stub = getattr(_token_params_ns, "_IS_STUB", False)
+    engine_api_is_stub = getattr(engine_api, "_IS_STUB", False)
+    if is_decoupled() and (token_params_is_stub or engine_api_is_stub):
+      raise RuntimeError(
+          "JetStream disabled by DECOUPLE_GCLOUD=TRUE or stubbed; build_tokenizer is unsupported. "
+          "Unset DECOUPLE_GCLOUD or install JetStream to enable tokenizer functionality."
+      )
     if metadata.tokenizer_type == TokenizerType.tiktoken:
       return token_utils.TikToken(metadata)
     elif metadata.tokenizer_type == TokenizerType.sentencepiece:
diff --git a/src/MaxText/maxengine_config.py b/src/MaxText/maxengine_config.py
index 3f7a24e2a0..9a568f20b3 100644
--- a/src/MaxText/maxengine_config.py
+++ b/src/MaxText/maxengine_config.py
@@ -18,24 +18,32 @@
 
 import jax
 
-from jetstream.core import config_lib
-from jetstream.engine import engine_api
+from MaxText.gcloud_stub import jetstream, is_decoupled
+config_lib, engine_api, _token_utils, _tokenizer_api, _token_params_ns = jetstream()
 
 from MaxText import maxengine
 
 
 # TODO: merge it with the above create_maxengine().
-def create_exp_maxengine(devices: config_lib.Devices, config: Any) -> engine_api.Engine:
+def create_exp_maxengine(devices: Any, config: Any):
+  if is_decoupled():
+    return maxengine.MaxEngine(config)
   return maxengine.MaxEngine(config=config, devices=devices)
 
 
-def create_maxengine(devices: config_lib.Devices, config: Any) -> engine_api.Engine:
+def create_maxengine(devices: Any, config: Any) -> engine_api.Engine:
   del devices
   return maxengine.MaxEngine(config)
 
 
-def get_server_config(config_str: str, config: Any) -> Type[config_lib.ServerConfig]:
-  """Gets the Server Config Required by JetStream"""
+def get_server_config(config_str: str, config: Any):
+  """Gets the Server Config Required by JetStream."""
+  # If Jetstream is stub and decoupled, return a minimal stub server config and log the no-op.
+  config_lib_is_stub = getattr(config_lib, "_IS_STUB", False)
+  engine_api_is_stub = getattr(engine_api, "_IS_STUB", False)
+  if is_decoupled() and (config_lib_is_stub or engine_api_is_stub):
+    raise RuntimeError("[DECOUPLED NO-OP] jetstream.config_lib is stubbed; returning minimal server config.")
+  # Not decoupled and no Jetstream found -> allow the later code to raise.
   match config_str:
     case "MaxtextInterleavedServer":
       server_config = config_lib.ServerConfig(
diff --git a/src/MaxText/maxengine_server.py b/src/MaxText/maxengine_server.py
index d7955aad19..d5af08577d 100644
--- a/src/MaxText/maxengine_server.py
+++ b/src/MaxText/maxengine_server.py
@@ -17,11 +17,10 @@
 import os
 import sys
 
-import pathwaysutils  # pylint: disable=unused-import
-
-from jetstream.core import server_lib, config_lib
+from MaxText import gcloud_stub
 
 import jax
+from typing import Any
 
 from MaxText import pyconfig
 from MaxText import maxengine_config
@@ -37,7 +36,7 @@
 # )
 
 
-def _create_prefix_caching_config(config) -> config_lib.PrefixCachingConfig | None:
+def _create_prefix_caching_config(config):
   if not config.enable_prefix_caching:
     return None
 
@@ -51,13 +50,30 @@ def _create_prefix_caching_config(config) -> config_lib.PrefixCachingConfig | No
 
 
 def main(config):
+  # Obtain the jetstream helper modules (or stubs if appropriate).
+  config_lib, _engine_api, _token_utils, _tokenizer_api, _token_params_ns = gcloud_stub.jetstream()
+
+  # If running decoupled and gcloud_stub returned lightweight stubs, skip
+  # starting the real server. Use the explicit _IS_STUB marker when present.
+  config_lib_is_stub = getattr(config_lib, "_IS_STUB", False)
+  engine_api_is_stub = getattr(_engine_api, "_IS_STUB", False)
+  if gcloud_stub.is_decoupled() and (config_lib_is_stub or engine_api_is_stub):
+    raise RuntimeError(
+        "JetStream helper modules are stubbed or DECOUPLE_GCLOUD=TRUE; server cannot be started in decoupled mode. "
+        "Unset DECOUPLE_GCLOUD or install JetStream to run the server."
+    )
+
+  # Import the real server_lib now that it's known present.
+  from jetstream.core import server_lib  # type: ignore
+  import pathwaysutils  # pylint: disable=unused-import
+
   pathwaysutils.initialize()
 
   # No devices for local cpu test. A None for prefill and a None for generate.
   devices = server_lib.get_devices()
   server_config = maxengine_config.get_server_config(config.inference_server, config)
 
-  metrics_server_config: config_lib.MetricsServerConfig | None = None
+  metrics_server_config: Any | None = None
   if config.prometheus_port != 0:
     metrics_server_config = config_lib.MetricsServerConfig(port=config.prometheus_port)
 
diff --git a/src/MaxText/metric_logger.py b/src/MaxText/metric_logger.py
index 77f1574b9b..b535167b45 100644
--- a/src/MaxText/metric_logger.py
+++ b/src/MaxText/metric_logger.py
@@ -25,14 +25,18 @@
 
 import jax
 
-import google_cloud_mldiagnostics as mldiag
+from MaxText.gcloud_stub import mldiagnostics_modules
+
+mldiag, _ = mldiagnostics_modules()
 
 from MaxText import max_logging
 from MaxText import max_utils
 from MaxText import maxtext_utils
 from MaxText.managed_mldiagnostics import ManagedMLDiagnostics
 from MaxText.utils import gcs_utils
-from MaxText.gcp_workload_monitor import GCPWorkloadMonitor
+from MaxText.gcloud_stub import is_decoupled, workload_monitor
+GCPWorkloadMonitor, _monitor_is_stub = workload_monitor()
+
 from MaxText.globals import EPS
 
 from collections import defaultdict
@@ -279,13 +283,15 @@ def write_setup_info_to_tensorboard(self, params):
   def get_performance_metric_queue(self, config):
     """Records heartbeat metrics and performance metrics to GCP."""
     performance_metric_queue = None
-    if config.report_heartbeat_metric_for_gcp_monitoring or config.report_performance_metric_for_gcp_monitoring:
+    if (config.report_heartbeat_metric_for_gcp_monitoring or config.report_performance_metric_for_gcp_monitoring) and not _monitor_is_stub:
       gcp_workload_monitor = GCPWorkloadMonitor(config.run_name)
       if config.report_heartbeat_metric_for_gcp_monitoring:
         gcp_workload_monitor.start_heartbeat_reporting_thread(config.heartbeat_reporting_interval_in_seconds)
       if config.report_performance_metric_for_gcp_monitoring:
         performance_metric_queue = queue.Queue()
         gcp_workload_monitor.start_performance_reporting_thread(performance_metric_queue)
+    elif (config.report_heartbeat_metric_for_gcp_monitoring or config.report_performance_metric_for_gcp_monitoring) and _monitor_is_stub:
+      max_logging.log("[DECOUPLED NO-OP] skipping GCP workload monitoring threads.")
     return performance_metric_queue
 
   def buffer_and_write_train_metrics(self, metrics, step, step_time_delta):
diff --git a/src/MaxText/prefill_packing.py b/src/MaxText/prefill_packing.py
index e39fc564ac..b47117f61f 100644
--- a/src/MaxText/prefill_packing.py
+++ b/src/MaxText/prefill_packing.py
@@ -20,7 +20,16 @@
 import jax.numpy as jnp
 import numpy as np
 
-from jetstream.engine import engine_api
+from MaxText.gcloud_stub import jetstream, is_decoupled
+
+config_lib, engine_api, token_utils, tokenizer_api, token_params_ns = jetstream()
+
+jetstream_is_stub = getattr(config_lib, "_IS_STUB", False) or getattr(engine_api, "_IS_STUB", False)
+
+if is_decoupled() and jetstream_is_stub:
+  raise RuntimeError(
+      "prefill_packing imported while DECOUPLE_GCLOUD=TRUE. This module depends on JetStream."
+  )
 
 from MaxText.maxengine import MaxEngine
 
@@ -116,7 +125,7 @@ def process(
       input_true_length: int,
       rng: PRNGKeyType,
       return_prompt_logp: bool = False,
-  ) -> tuple[engine_api.ResultTokens, DecodeState]:
+  ) -> tuple[Any, DecodeState]:
     """Process a new input."""
 
     process_fn = self._process_compiled(model_params, len(input_tokens_padded), return_prompt_logp)
@@ -162,7 +171,7 @@ def _process(
       decode_state: DecodeState,
       rng: PRNGKeyType,
       return_prompt_logp: bool = False,
-  ) -> tuple[engine_api.ResultTokens, DecodeState]:
+  ) -> tuple[Any, DecodeState]:
     """Prefill and insert a request."""
 
     prefill_result, first_token = self.engine.prefill(
@@ -205,7 +214,7 @@ def process(
       input_prompt: jax.Array,
       input_padding: int,
       capacity: int,
-      prefill_done: Callable[[list[tuple[engine_api.ResultTokens, int]], list[int], DecodeState], None],
+      prefill_done: Callable[[list[tuple[Any, int]], list[int], DecodeState], None],
       return_prompt_logp: bool = False,
   ) -> None:
     """Process a new input.
@@ -241,7 +250,7 @@ def flush(
       self,
       model_params: Params,
       decode_state: DecodeState,
-      prefill_done: Callable[[list[tuple[engine_api.ResultTokens, int]], list[int], DecodeState], None],
+      prefill_done: Callable[[list[tuple[Any, int]], list[int], DecodeState], None],
       return_prompt_logp: bool = False,
   ) -> None:
     """Process all remaining items in buckets."""
@@ -262,7 +271,7 @@ def _process_bucket(
       input_padding: int,
       decode_state: DecodeState,
       return_prompt_logp: bool = False,
-  ) -> tuple[list[tuple[engine_api.ResultTokens, int]], DecodeState]:
+  ) -> tuple[list[tuple[Any, int]], DecodeState]:
     """Process all items in a bucket."""
     # pylint: disable=import-outside-toplevel
     from MaxText.inference.offline_engine import PrefillResult  # type: ignore
@@ -388,7 +397,7 @@ def _process_batch(  # pylint: disable=too-many-positional-arguments
       true_lengths: jax.Array,
       decode_state: DecodeState,
       return_prompt_logp: bool = False,
-  ) -> tuple[list[engine_api.ResultTokens], DecodeState]:
+  ) -> tuple[list[Any], DecodeState]:
     """Prefill and insert a packed request."""
 
     cache, prefix_state, first_tokens = self.engine.prefill_concat(
diff --git a/src/MaxText/profiler.py b/src/MaxText/profiler.py
index 0b0d21163b..c555abf6b7 100644
--- a/src/MaxText/profiler.py
+++ b/src/MaxText/profiler.py
@@ -21,7 +21,9 @@
 
 import jax
 
-import google_cloud_mldiagnostics as mldiag
+from MaxText.gcloud_stub import mldiagnostics_modules
+
+mldiag, _ = mldiagnostics_modules()
 
 from MaxText import max_logging
 from MaxText.managed_mldiagnostics import ManagedMLDiagnostics
diff --git a/src/MaxText/sft/hooks.py b/src/MaxText/sft/hooks.py
index 1fb1afe80c..dced9f9576 100644
--- a/src/MaxText/sft/hooks.py
+++ b/src/MaxText/sft/hooks.py
@@ -34,7 +34,7 @@
 from MaxText import exceptions
 from MaxText import max_logging
 from MaxText import max_utils
-from MaxText import sharding
+from MaxText import maxtext_utils
 from MaxText.data_loader import DataLoader
 from MaxText.input_pipeline.input_pipeline_interface import create_data_iterator
 from MaxText.metric_logger import MetricLogger, MetadataKey
@@ -61,7 +61,7 @@ def on_train_start(self, train_ctx: peft_trainer.PeftTrainer):
     params = state.filter(nnx.Param)
 
     if not self.config.using_pipeline_parallelism:
-      sharding.assert_params_sufficiently_sharded(params, self.mesh, self.config.sharding_tolerance)
+      maxtext_utils.assert_params_sufficiently_sharded(params, self.mesh, self.config.sharding_tolerance)
 
     self.metric_logger.write_setup_info_to_tensorboard(params)
     if MetadataKey.PER_DEVICE_TFLOPS in self.metric_logger.metadata:
diff --git a/src/MaxText/sft/sft_trainer.py b/src/MaxText/sft/sft_trainer.py
index 6e55187444..51d46c6ba2 100644
--- a/src/MaxText/sft/sft_trainer.py
+++ b/src/MaxText/sft/sft_trainer.py
@@ -46,7 +46,7 @@
 
 from orbax import checkpoint as ocp
 
-from tunix.sft import metrics_logger, peft_trainer, profiler
+from tunix.sft import peft_trainer, profiler
 
 from MaxText import max_utils
 from MaxText import max_logging
diff --git a/src/MaxText/train.py b/src/MaxText/train.py
index f5e8cf377b..22d4f1e7cd 100644
--- a/src/MaxText/train.py
+++ b/src/MaxText/train.py
@@ -37,10 +37,9 @@
 from flax import linen as nn
 from flax.linen import partitioning as nn_partitioning
 
-from cloud_tpu_diagnostics import diagnostic
-from cloud_tpu_diagnostics.configuration import debug_configuration
-from cloud_tpu_diagnostics.configuration import diagnostic_configuration
-from cloud_tpu_diagnostics.configuration import stack_trace_configuration
+from MaxText.gcloud_stub import cloud_diagnostics as _cloud_diag, is_decoupled
+_diag_modules = _cloud_diag()
+diagnostic, debug_configuration, diagnostic_configuration, stack_trace_configuration = _diag_modules
 
 from MaxText import checkpointing
 from MaxText import exceptions
@@ -62,7 +61,8 @@
     maybe_monitor_goodput,
     maybe_record_goodput,
 )
-from MaxText.vertex_tensorboard import VertexTensorboardManager
+from MaxText.gcloud_stub import vertex_tensorboard_components
+VertexTensorboardManager, _vertex_tb_is_stub = vertex_tensorboard_components()
 # Placeholder: internal
 
 from MaxText.gradient_accumulation import gradient_accumulation_loss_and_grad
@@ -521,7 +521,10 @@ def initialize(argv: Sequence[str]) -> tuple[pyconfig.HyperParameters, Any, Any]
   os.environ["TFDS_DATA_DIR"] = config.dataset_path or ""
   vertex_tensorboard_manager = VertexTensorboardManager()
   if config.use_vertex_tensorboard or os.environ.get("UPLOAD_DATA_TO_TENSORBOARD"):
-    vertex_tensorboard_manager.configure_vertex_tensorboard(config)
+    if _vertex_tb_is_stub:
+      max_logging.log("[DECOUPLED NO-OP] skipping Vertex Tensorboard configuration.")
+    else:
+      vertex_tensorboard_manager.configure_vertex_tensorboard(config)
 
   # Create the Goodput recorder
   recorder = create_goodput_recorder(config)
@@ -539,15 +542,25 @@ def initialize(argv: Sequence[str]) -> tuple[pyconfig.HyperParameters, Any, Any]
 
 
 def run(config, recorder, diagnostic_config):
-  """Run the job given hyperparameters and utilities"""
-  with (
-      diagnostic.diagnose(diagnostic_config),
-      maybe_record_goodput(recorder, GoodputEvent.JOB),
-      max_utils.maybe_get_transformer_engine_context(config),
-      maybe_monitor_goodput(config),
-  ):
-    train_loop(config, recorder)
+  """Run the job given hyperparameters and utilities.
 
+  In decoupled mode (DECOUPLE_GCLOUD=TRUE) cloud diagnostics may be stubbed; if so, skip wrapping.
+  """
+  if is_decoupled() or getattr(diagnostic, "__class__", None).__name__ == "_StubDiag":  # runtime skip
+    max_logging.log("[DECOUPLED NO-OP] skipping cloud diagnostics wrapper.")
+    with (
+        maybe_record_goodput(recorder, GoodputEvent.JOB),
+        max_utils.maybe_get_transformer_engine_context(config),
+    ):
+        train_loop(config, recorder)
+  else:
+    with (
+        diagnostic.diagnose(diagnostic_config),
+        maybe_record_goodput(recorder, GoodputEvent.JOB),
+        max_utils.maybe_get_transformer_engine_context(config),
+        maybe_monitor_goodput(config),
+    ):
+        train_loop(config, recorder)
 
 def main(argv: Sequence[str]) -> None:
   config, recorder, diagnostic_config = initialize(argv)
diff --git a/src/MaxText/utils/gcs_utils.py b/src/MaxText/utils/gcs_utils.py
index ec379d30b0..19c903a5d5 100644
--- a/src/MaxText/utils/gcs_utils.py
+++ b/src/MaxText/utils/gcs_utils.py
@@ -21,16 +21,30 @@
 
 import yaml
 
-from google.cloud import storage
-
 import jax
 
 from MaxText import max_logging
+from MaxText.gcloud_stub import is_decoupled, gcs_storage
+
+storage = gcs_storage()
+
 
+def _gcs_guard(operation_name: str) -> bool:
+  """Check GCS availability for an operation. """
+  if getattr(storage, "_IS_STUB", False):
+    if is_decoupled():
+      max_logging.log(f"[GCS NO-OP] {operation_name}")
+      return False
+    raise RuntimeError(
+        f"google-cloud-storage missing for {operation_name}. Install or set DECOUPLE_GCLOUD=TRUE."
+    )
+  return True
 
 def write_config_raw_keys_for_gcs(raw_keys):
-  """Writes config raw keys to GCS"""
-  if not raw_keys["save_config_to_gcs"] or jax.process_index() != 0:
+  """Writes config raw keys to GCS (no-op if disabled or decoupled)."""
+  if not raw_keys.get("save_config_to_gcs") or jax.process_index() != 0:
+    return
+  if not _gcs_guard("write_config_raw_keys_for_gcs"):
     return
   max_logging.log("Writing config to GCS...")
 
@@ -60,7 +74,9 @@ def add_trailing_slash(path):
 
 
 def upload_blob(destination_gcs_name, source_file_name):
-  """Uploads a file to a GCS location"""
+  """Uploads a file to a GCS location (no-op if not found and decoupled)."""
+  if not _gcs_guard("upload_blob"):
+    return
   bucket_name, prefix_name = parse_gcs_bucket_and_prefix(destination_gcs_name)
   storage_client = storage.Client()
   bucket = storage_client.get_bucket(bucket_name)
@@ -69,9 +85,11 @@ def upload_blob(destination_gcs_name, source_file_name):
 
 
 def upload_dump(local_dir, target_dir, module_name=None, delete_local_after=True, all_host_upload=False):
-  """Uploads a directory to a GCS location, with an optional filter"""
+  """Uploads a directory to a GCS location, with an optional filter (no-op if not found and decoupled)."""
   if not all_host_upload and jax.process_index() != 0:
     return
+  if not _gcs_guard("upload_dump"):
+    return
   storage_client = storage.Client()
   bucket_name, prefix_name = parse_gcs_bucket_and_prefix(target_dir)
   bucket = storage_client.get_bucket(bucket_name)
@@ -97,7 +115,9 @@ def upload_dump(local_dir, target_dir, module_name=None, delete_local_after=True
 
 
 def gcs_path_exists(file_path):
-  """Checks if a GCS file_path exits."""
+  """Checks if a GCS file_path exists (no-op if not found and decoupled)."""
+  if not _gcs_guard("gcs_path_exists"):
+    return False
   try:
     storage_client = storage.Client()
     bucket_name, file_name = parse_gcs_bucket_and_prefix(file_path)
@@ -120,6 +140,8 @@ def gcs_list_directories(directory_path):
   Returns:
       A list of "directory" names (prefixes).
   """
+  if not _gcs_guard("gcs_list_directories"):
+    return []
   storage_client = storage.Client()
   bucket_name, directory_prefix = parse_gcs_bucket_and_prefix(directory_path)
   bucket = storage_client.bucket(bucket_name)
@@ -166,6 +188,8 @@ def read_json_from_gcs(file_path):
   Returns:
     A dictionary with content from json file.
   """
+  if not _gcs_guard("read_json_from_gcs"):
+    return None
   try:
     storage_client = storage.Client()
     bucket_name, file_prefix = parse_gcs_bucket_and_prefix(file_path)
@@ -190,6 +214,8 @@ def write_dict_to_gcs_json(data_dict, file_path):
     data_dict: The Python dictionary to write
     file_path: GCS path (Bucket + blob) to create the json file
   """
+  if not _gcs_guard("write_dict_to_gcs_json"):
+    return
   try:
     storage_client = storage.Client()
     bucket_name, file_prefix = parse_gcs_bucket_and_prefix(file_path)
diff --git a/src/MaxText/utils/goodput_utils.py b/src/MaxText/utils/goodput_utils.py
index 23fe364269..80030576ec 100644
--- a/src/MaxText/utils/goodput_utils.py
+++ b/src/MaxText/utils/goodput_utils.py
@@ -23,7 +23,9 @@
 import jax
 from MaxText import max_logging
 from enum import Enum
-from ml_goodput_measurement import goodput, monitoring
+from MaxText.gcloud_stub import goodput_modules
+
+goodput, monitoring, _GOODPUT_STUB = goodput_modules()
 
 
 class GoodputEvent(Enum):
@@ -36,6 +38,10 @@ class GoodputEvent(Enum):
 
 @contextlib.contextmanager
 def maybe_monitor_goodput(config):
+  if _GOODPUT_STUB:
+    if config.monitor_goodput and jax.process_index() == 0:
+      max_logging.log("[GOODPUT NO-OP] monitoring disabled (decoupled stub).")
+    return
   """Monitor cumulative goodput if enabled."""
   if not config.monitor_goodput or jax.process_index() != 0:
     yield
@@ -96,6 +102,10 @@ def record_goodput(recorder, event_name, *args):
 
 def create_goodput_recorder(config):
   """Create goodput recorder if `enable_goodput_recording=True`."""
+  if _GOODPUT_STUB:
+    if config.enable_goodput_recording and jax.process_index() == 0:
+      max_logging.log("[GOODPUT NO-OP] recorder skipped (decoupled stub).")
+    return None
   if config.enable_goodput_recording:
     logger_name = f"goodput_{config.run_name}"
     recorder = goodput.GoodputRecorder(config.run_name, logger_name, jax.process_index() == 0)
diff --git a/src/MaxText/vertex_tensorboard.py b/src/MaxText/vertex_tensorboard.py
index 39be293943..6a70c28705 100644
--- a/src/MaxText/vertex_tensorboard.py
+++ b/src/MaxText/vertex_tensorboard.py
@@ -15,6 +15,7 @@
 """Utilities for Tensorboard in Vertex AI."""
 
 import os
+from MaxText.gcloud_stub import is_decoupled
 
 import jax
 
@@ -99,6 +100,10 @@ def upload_data(self, tensorboard_dir):
 
   def configure_vertex_tensorboard(self, config):
     """Creates Vertex Tensorboard and start thread to upload data to Vertex Tensorboard."""
+    # Skip all Vertex related logic when decoupled from Google Cloud.
+    if is_decoupled():
+      max_logging.log("Decoupled mode -> Skipping Vertex Tensorboard configuration.")
+      return
     if jax.process_index() == 0:
       if not os.environ.get("TENSORBOARD_PROJECT"):
         if not config.vertex_tensorboard_project:
diff --git a/tests/aot_hlo_identical_test.py b/tests/aot_hlo_identical_test.py
index c83827997a..5fd9288e55 100644
--- a/tests/aot_hlo_identical_test.py
+++ b/tests/aot_hlo_identical_test.py
@@ -22,11 +22,13 @@
 import unittest
 import pytest
 import os
+from MaxText.gcloud_stub import is_decoupled
 import shutil
 import hashlib
 import re
 import jax
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText import train_compile
 from MaxText import train
 
@@ -39,8 +41,23 @@ def setUp(self):
     Fix the dump dir and xla flags
     """
     jax.config.update("jax_enable_compilation_cache", False)
-    temp_dir = tempfile.gettempdir()
-    self.dump_dir = os.path.join(temp_dir, "aot_test_dump")
+    decoupled = is_decoupled()
+    if decoupled:
+      logs_root = os.path.join(
+          MAXTEXT_PKG_DIR,
+          "..",
+          "local_datasets",
+          "gcloud_decoupled_test_logs",
+          "aot_hlo_identical_test",
+      )
+      os.makedirs(logs_root, exist_ok=True)
+      self._aot_logs_root = logs_root
+    else:
+      self._aot_logs_root = os.path.join(tempfile.gettempdir(), "compile_test_xla_dump")
+      os.makedirs(self._aot_logs_root, exist_ok=True)
+    
+    self.dump_dir = os.path.join(self._aot_logs_root, "aot_test_dump")
+
     xla_dump_options = "--xla_dump_hlo_as_text --xla_dump_hlo_module_re=jit_train_step"
     os.environ["XLA_FLAGS"] = f"--xla_dump_to={self.dump_dir} {xla_dump_options}"
 
@@ -105,10 +122,17 @@ def check_large_files_equal(self, file_path1, file_path2):
 
   def assert_compile_and_real_match_hlo(self, test_name, *extra_args):
     """check that AOT compiled and trained HLO files are identical for a given test"""
-    temp_dir = tempfile.gettempdir()
-    compile_dump_dir = os.path.join(temp_dir, "compile_test_xla_dump", test_name, "aot", "")
+    decoupled = is_decoupled()
+    if decoupled:
+      root = self._aot_logs_root  # set in setUp
+      base_output_directory = root
+    else:
+      root = os.path.join(tempfile.gettempdir(), "compile_test_xla_dump")
+      os.makedirs(root, exist_ok=True)
+      base_output_directory = "gs://runner-maxtext-logs"
+    compile_dump_dir = os.path.join(root, test_name, "aot")
     shared_args = [
-        "base_output_directory=gs://runner-maxtext-logs",
+        f"base_output_directory={base_output_directory}",
         "run_name=compile_equivalent_test",
         "dataset_type=synthetic",
         "steps=1",
@@ -117,12 +141,12 @@ def assert_compile_and_real_match_hlo(self, test_name, *extra_args):
     if extra_args is not None:
       shared_args.extend(extra_args)
 
-    train_dump_dir = os.path.join(temp_dir, "compile_test_xla_dump", test_name, "real", "")
-    train_argv = (None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")) + tuple(shared_args)
+    train_dump_dir = os.path.join(root, test_name, "real")
+    train_argv = (None, get_test_config_path()) + tuple(shared_args)
     topology = self.get_device_user_facing_name()
     aot_args = [f"compile_topology={topology}", "compile_topology_num_slices=1"]
-    compile_argv = (None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")) + tuple(shared_args) + tuple(aot_args)
-    compile_dump_dir = os.path.join(temp_dir, "compile_test_xla_dump", test_name, "aot", "")
+    compile_argv = (None, get_test_config_path()) + tuple(shared_args) + tuple(aot_args)
+    compile_dump_dir = os.path.join(root, test_name, "aot")
 
     # Cleanup directories before use
     self.delete_dir(self.dump_dir, compile_dump_dir, train_dump_dir)
diff --git a/tests/attention_test.py b/tests/attention_test.py
index c5e7c1bbab..82da8fdc4f 100644
--- a/tests/attention_test.py
+++ b/tests/attention_test.py
@@ -43,6 +43,8 @@
 import pytest
 
 from . import attention_test_util
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 
 
 class BidirectionalBlockMaskTest(unittest.TestCase):
@@ -287,10 +289,14 @@ class AttentionTest(parameterized.TestCase):
   def setUp(self):
     """Initializes the configuration for each test"""
     super().setUp()
-    jax.config.update("jax_remove_size_one_mesh_axis_from_type", True)
+    # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
+    if not is_decoupled():
+        jax.config.update("jax_remove_size_one_mesh_axis_from_type", True)
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **self.config_arguments,
+        **extra_args,
     )
     self.cfg = config
 
@@ -658,7 +664,7 @@ def test_tpu_flash_attention_context_parallel(
 
     # Test with Context Parallelism
     cfg_cp = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **self.config_arguments,
         ici_context_parallelism=ici_context_parallelism,
         context_parallel_load_balance=context_parallel_load_balance,
@@ -735,7 +741,7 @@ def _dot_product_attention(
     rtol, atol = 1e-02, 1e-02
 
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         per_device_batch_size=1.0,
         run_name="test",
         enable_checkpointing=False,
@@ -826,7 +832,7 @@ def _dot_product_attention_reshape_q(self, compute_axis_order):
     rtol, atol = 1e-02, 1e-02
 
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         per_device_batch_size=1.0,
         run_name="test",
         enable_checkpointing=False,
@@ -1241,9 +1247,11 @@ def test_projection_initialization(self):
     # Create a copy of the arguments and override the attention_type for the base model
     attention_config_args = self.config_arguments.copy()
     attention_config_args["attention_type"] = AttentionType.GLOBAL.value
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
     attention_cfg = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **attention_config_args,
+        **extra_args,
     )
     dummy_inputs_q = jnp.ones(
         (attention_cfg.global_batch_size_to_train_on, attention_cfg.max_target_length, attention_cfg.base_emb_dim)
@@ -1274,6 +1282,10 @@ def test_projection_initialization(self):
     self.assertTrue(hasattr(base_attention, "out"), "Base Attention should have 'out' projection.")
 
     # 3. Initialize the MLA layer
+    mla_config_args = self.config_arguments.copy()
+    mla_extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
+    mla_config_args.update(mla_extra_args)
+    _, mla_layer = self.init_mla(mla_config_args, rope_type="default")
     _, mla_layer = self.init_mla(self.config_arguments, rope_type="default")
 
     # 4. Assert that the MLA layer DOES NOT HAVE the base projections
@@ -1437,7 +1449,7 @@ def test_tpu_flash_attention_context_parallel(
 
     # Test with Context Parallelism
     cfg_cp = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **config_arguments,
         rope_type=cfg.rope_type,
         ici_context_parallelism=ici_context_parallelism,
diff --git a/tests/attention_test_util.py b/tests/attention_test_util.py
index 1dc4c55dc4..52261d3388 100644
--- a/tests/attention_test_util.py
+++ b/tests/attention_test_util.py
@@ -25,6 +25,8 @@
 from MaxText import max_utils
 from MaxText import maxtext_utils
 from MaxText import pyconfig
+from MaxText.gcloud_stub import is_decoupled
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.common_types import AttentionType, DECODING_ACTIVE_SEQUENCE_INDICATOR, EP_AS_CONTEXT, MODEL_MODE_PREFILL, MODEL_MODE_TRAIN, ShardMode
 from MaxText.globals import MAXTEXT_PKG_DIR
 from MaxText.layers.attention_mla import MLA
@@ -52,10 +54,23 @@ class MLATestBase(parameterized.TestCase):
   def setUp(self):
     """Initializes the configuration for each test"""
     super().setUp()
-    jax.config.update("jax_remove_size_one_mesh_axis_from_type", True)
+    config_args = dict(self.config_arguments)
+    if is_decoupled():  # TODO(gulsumgudukbay): remove this after jax is updated.
+      # Older/newer JAX versions may not recognize this flag; ignore if absent.
+      try:
+        jax.config.update("jax_remove_size_one_mesh_axis_from_type", True)
+      except AttributeError:
+        pass
+      # In decoupled mode, adapt mesh/ICI parallelism to local devices so
+      # fill_unspecified_mesh_axes matches the available device count.
+      config_args.setdefault("mesh_axes", ["data"])
+      config_args.setdefault("ici_data_parallelism", -1)
+    else:
+      jax.config.update("jax_remove_size_one_mesh_axis_from_type", True)
+
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
-        **self.config_arguments,
+        [sys.argv[0], get_test_config_path()],
+        **config_args,
     )
     self.cfg = config
     self.rng = jax.random.PRNGKey(0)
diff --git a/tests/check_gpt_vs_reference.py b/tests/check_gpt_vs_reference.py
index ea748719bf..c23f9fb0c2 100644
--- a/tests/check_gpt_vs_reference.py
+++ b/tests/check_gpt_vs_reference.py
@@ -36,6 +36,7 @@
 import jax.numpy as jnp
 
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText import pyconfig
 from MaxText import maxtext_utils
 from MaxText.layers import attentions, moe, embeddings
@@ -295,7 +296,7 @@ def test_mlp_block(self):
 
     # MaxText model
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="gpt_oss_mlp_test",
         enable_checkpointing=False,
         model_name="default",
@@ -402,7 +403,7 @@ def test_dot_product_attention_with_sinks(self):
     )
 
     cfg_dot = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="gpt_oss_attention_test_dot",
         enable_checkpointing=False,
         model_name="default",
@@ -465,7 +466,7 @@ def test_flash_attention_with_sinks(self):
     )
 
     cfg_flash = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="gpt_oss_attention_test_flash",
         enable_checkpointing=False,
         model_name="default",
diff --git a/tests/check_llama4_layers.py b/tests/check_llama4_layers.py
index 37dc201d36..f4809515e7 100644
--- a/tests/check_llama4_layers.py
+++ b/tests/check_llama4_layers.py
@@ -27,6 +27,7 @@
 from jax.experimental import mesh_utils
 
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.common_types import MODEL_MODE_TRAIN, AttentionType
 from MaxText import pyconfig
 from MaxText import maxtext_utils
@@ -615,7 +616,7 @@ class Config(NamedTuple):
   def setUp(self):
     super().setUp()
     self.cfg = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **self.config_arguments,
     )
     self.rng = jax.random.PRNGKey(0)
@@ -894,7 +895,7 @@ class Config(NamedTuple):
   def setUp(self):
     super().setUp()
     self.cfg = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **self.config_arguments,
     )
     self.rng = jax.random.PRNGKey(0)
diff --git a/tests/check_moba_vs_reference.py b/tests/check_moba_vs_reference.py
index 08388c48b6..6b4b2e58d9 100644
--- a/tests/check_moba_vs_reference.py
+++ b/tests/check_moba_vs_reference.py
@@ -34,6 +34,7 @@
 
 from MaxText import maxtext_utils, pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.layers.attention_op import AttentionOp
 
 # pylint: disable=missing-function-docstring,protected-access
@@ -236,7 +237,7 @@ def _get_jax_results(
   ):
     """Computes results from the JAX implementation."""
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         run_name="moba_test",
         enable_checkpointing=False,
         model_name="default",
@@ -379,7 +380,7 @@ def test_end_to_end_mask(self):
 
         # Get JAX mask
         jax_config = pyconfig.initialize(
-            [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+            [sys.argv[0], get_test_config_path()],
             run_name="moba_test_mask",
             enable_checkpointing=False,
             model_name="default",
diff --git a/tests/check_qwen3_next_vs_reference.py b/tests/check_qwen3_next_vs_reference.py
index e1e1166757..c0c25aca21 100644
--- a/tests/check_qwen3_next_vs_reference.py
+++ b/tests/check_qwen3_next_vs_reference.py
@@ -619,7 +619,7 @@ def setUp(self):
     self.cfg = pyconfig.initialize(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             # Base settings for the test
             "run_name=qwen3_next_test",
             "dtype=float32",
@@ -1074,7 +1074,7 @@ def _run_full_attention_jax_vs_pytorch_attention(self, attention_type):
     cfg = pyconfig.initialize(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             # Base settings for the test
             "run_name=qwen3_next_test",
             "dtype=float32",
diff --git a/tests/context_parallelism_test.py b/tests/context_parallelism_test.py
index 4eb667a0a3..4193034dcb 100644
--- a/tests/context_parallelism_test.py
+++ b/tests/context_parallelism_test.py
@@ -29,6 +29,7 @@
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
 from MaxText import maxtext_utils
+from maxtext.tests.test_utils import get_test_config_path
 
 
 class ContextParallelismTest(unittest.TestCase):
@@ -62,7 +63,7 @@ class ContextParallelismTest(unittest.TestCase):
 
   def setUp(self):
     config_cp = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **self.config_arguments,
         ici_context_parallelism=4,  # use context parallelism of 4
         context_parallel_load_balance=False,  # set load_balancing to False such that
diff --git a/tests/data_loader_test.py b/tests/data_loader_test.py
index 9f1e0ab0c7..c0a769be93 100644
--- a/tests/data_loader_test.py
+++ b/tests/data_loader_test.py
@@ -17,6 +17,7 @@
 import unittest
 import os.path
 import numpy as np
+import pytest
 
 import jax
 
@@ -26,9 +27,11 @@
 from MaxText.data_loader import DataLoader, RampUpDataLoader
 from MaxText.rampup_batch import RampupBatchManager
 from MaxText.maxtext_utils import create_device_mesh
+from MaxText.gcloud_stub import is_decoupled
 from MaxText import exceptions
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 
 
 class DataLoaderTest(unittest.TestCase):
@@ -56,8 +59,16 @@ def get_test_config(self, reuse_example_batch, **kwargs):
         "reuse_example_batch": reuse_example_batch,
     }
     args.update(kwargs)
+
+    # In decoupled mode, adapt mesh/ICI parallelism so that the
+    # product of ICI parallelism matches the available devices for
+    # this test only.
+    if is_decoupled():
+      args.setdefault("mesh_axes", ["data"])
+      args.setdefault("ici_data_parallelism", -1)
+
     return pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         **args,
     )
 
@@ -126,6 +137,7 @@ def test_load_next_batch_throws_exception(self):
       _ = data_loader.load_next_batch()
     self.assertTrue(str(e.exception).startswith("You may have run out of training data."))
 
+  @pytest.mark.external_serving
   def test_rampup_data_loader(self):
     """Tests that RampUpLoader correctly slices and increment."""
     # Mock iterator returns a FULL batch (size 4)
@@ -167,9 +179,26 @@ def test_rampup_data_loader_from_checkpointing(self):
     data_loader = RampUpDataLoader(self.config_rampup, self.mesh, self.mock_data_iterator, None)
 
     # Expected batch sizes based on test config.
-    # The end global batch size is self.num_devices * per_device_batch_size
-    # The rampup should be: 3 steps of size 8, 2 steps of size 12, then size 16.
-    expected_batch_sizes = [8, 8, 8, 12, 12, 16, 16]
+    # The end global batch size is self.num_devices * per_device_batch_size.
+    # In decoupled mode, derive the schedule from a fresh RampupBatchManager
+    # so it matches the actual global batch sizes on the host.
+    if is_decoupled():
+      tmp_manager = RampupBatchManager(self.config_rampup, checkpoint_step)
+      expected_batch_sizes = []
+      # Collect sizes for the ramp-up phase.
+      while True:
+        expected_batch_sizes.append(tmp_manager.global_batch_size_current)
+        rampup_active = tmp_manager.update()
+        if not rampup_active:
+          break
+      # Add a couple of post-ramp-up steps at the final size, mirroring
+      # the original test's intent.
+      for _ in range(2):
+        expected_batch_sizes.append(tmp_manager.global_batch_size_current)
+        tmp_manager.update()
+    else:
+      # The rampup should be: 3 steps of size 8, 2 steps of size 12, then size 16.
+      expected_batch_sizes = [8, 8, 8, 12, 12, 16, 16]
     for i, expected_size in enumerate(expected_batch_sizes):
       batch = data_loader.load_next_batch(rampup_manager=rampup_manager)
       expected_shape = (expected_size, self.config_rampup.max_target_length)
diff --git a/tests/decode_tests.py b/tests/decode_tests.py
index cb50ca7623..8a75154a79 100644
--- a/tests/decode_tests.py
+++ b/tests/decode_tests.py
@@ -16,6 +16,7 @@
 
 import io
 import os
+from MaxText.gcloud_stub import is_decoupled
 import unittest
 
 import pytest
@@ -23,21 +24,33 @@
 from absl.testing import absltest
 from contextlib import redirect_stdout
 
+pytestmark = [pytest.mark.tpu_only, pytest.mark.external_serving]
+
 from MaxText.decode import main as decode_main
 from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT
+from maxtext.tests.test_utils import get_test_config_path
 
 
 class DecodeTests(unittest.TestCase):
   """Tests decode with various configs."""
+  decoupled = is_decoupled()
+  _dataset_path = (
+    os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal") if decoupled else "gs://maxtext-dataset"
+  )
+  _base_output_directory = (
+      os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs")
+      if decoupled
+      else "gs://runner-maxtext-logs"
+  )
 
   GEMMA_2B_CKPT_PATH = "gs://maxtext-gemma/2b/2025-11-04-04-33//0/items"
   CONFIGS = {
       "base": [  # tests decode
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={_dataset_path}",
           "steps=2",
           "enable_checkpointing=False",
           "ici_tensor_parallelism=4",
@@ -47,10 +60,10 @@ class DecodeTests(unittest.TestCase):
       ],
       "int8": [  # tests decode with int8 quantization
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={_dataset_path}",
           "steps=2",
           "enable_checkpointing=False",
           "ici_tensor_parallelism=4",
@@ -62,10 +75,10 @@ class DecodeTests(unittest.TestCase):
       ],
       "pdb_lt_1": [  # tests decode with per_device_batch_size < 1
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={_dataset_path}",
           "steps=2",
           "enable_checkpointing=False",
           "ici_tensor_parallelism=4",
diff --git a/tests/distillation_data_processing_test.py b/tests/distillation_data_processing_test.py
index d98bd56fb4..296b0ce5ed 100644
--- a/tests/distillation_data_processing_test.py
+++ b/tests/distillation_data_processing_test.py
@@ -18,6 +18,7 @@
 import os
 import subprocess
 import unittest
+import pytest
 
 import transformers
 
@@ -69,7 +70,7 @@ def add_arguments_to_parser(parser):
   )
   return parser
 
-
+@pytest.mark.external_training # Calls gsutil to pull tokenizer.
 class DistillationDataProcessingTest(unittest.TestCase):
 
   @classmethod
diff --git a/tests/elastic_train_test.py b/tests/elastic_train_test.py
index 370782109b..9a5a6eac96 100644
--- a/tests/elastic_train_test.py
+++ b/tests/elastic_train_test.py
@@ -31,6 +31,7 @@
 from MaxText import max_utils
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 
 logging.basicConfig()
 logging.getLogger("pathwaysutils.elastic.manager").setLevel(logging.INFO)
@@ -116,7 +117,7 @@ def test_pyconfig_changes(self, good_slice_indices, total_slice_count, base_numb
     config = pyconfig.initialize(
         argv=[
             "test",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
         ],
         per_device_batch_size=base_number,
         checkpoint_period=1234,
diff --git a/tests/flop_calculation_test.py b/tests/flop_calculation_test.py
index 5c2ff253e0..805c6cfffb 100644
--- a/tests/flop_calculation_test.py
+++ b/tests/flop_calculation_test.py
@@ -21,6 +21,7 @@
 from MaxText.maxtext_utils import calculate_tflops_training_per_device
 from MaxText.globals import MAXTEXT_PKG_DIR
 from MaxText import pyconfig
+from maxtext.tests.test_utils import get_test_config_path
 
 
 class FlopCalculation(unittest.TestCase):
@@ -127,7 +128,7 @@ def test_llama2_7b_flops(self):
     golden_param_size = 6.74e9
     golden_tflops = 6 * B * S * golden_param_size / 1e12 + attention_flops
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         **kwargs,
     )
     calculated_tflops, _, _ = calculate_tflops_training_per_device(cfg)
@@ -165,7 +166,7 @@ def test_llama3_8b_flops(self):
     golden_param_size = 7.50e9
     golden_tflops = 6 * B * S * golden_param_size / 1e12 + attention_flops
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         **kwargs,
     )
     calculated_tflops, _, _ = calculate_tflops_training_per_device(cfg)
@@ -203,7 +204,7 @@ def test_mixtral_8x7b_flops(self):
     golden_param_size = 12.9e9
     golden_tflops = 6 * B * S * golden_param_size / 1e12 + attention_flops
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         **kwargs,
     )
     calculated_tflops, _, _ = calculate_tflops_training_per_device(cfg)
@@ -248,7 +249,7 @@ def test_deepseek2_16b_flops(self):
     golden_param_size = 2.4e9
     golden_tflops = 6 * B * S * golden_param_size / 1e12 + attention_flops
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         **kwargs,
     )
     calculated_tflops, _, _ = calculate_tflops_training_per_device(cfg)
@@ -287,7 +288,7 @@ def test_gpt_oss_20b_flops(self):
     golden_param_size = 3.6e9
     golden_tflops = 6 * B * S * golden_param_size / 1e12 + attention_flops
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         **kwargs,
     )
     calculated_tflops, _, _ = calculate_tflops_training_per_device(cfg)
diff --git a/tests/goodput_utils_test.py b/tests/goodput_utils_test.py
index 157cd263a1..007b0bc3d1 100644
--- a/tests/goodput_utils_test.py
+++ b/tests/goodput_utils_test.py
@@ -16,20 +16,32 @@
 
 import os
 import unittest
+import pytest
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from MaxText.gcloud_stub import is_decoupled
+from maxtext.tests.test_utils import get_test_config_path
 from unittest import mock
 from MaxText.utils.goodput_utils import create_goodput_recorder, maybe_monitor_goodput, maybe_record_goodput, GoodputEvent
 
-
+pytestmark = [pytest.mark.external_training]
 class GoodputUtilsTest(unittest.TestCase):
   """Tests for Goodput monitoring and recording."""
 
   def setUp(self):
+    decoupled = is_decoupled()
+    if decoupled:
+      root = self._aot_logs_root  # set in setUp
+      base_output_directory = root
+    else:
+      root = os.path.join(tempfile.gettempdir(), "compile_test_xla_dump")
+      os.makedirs(root, exist_ok=True)
+      base_output_directory = "gs://runner-maxtext-logs"
+
     super().setUp()
     self.config = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
-        base_output_directory="gs://runner-maxtext-logs",
+        [None, get_test_config_path()],
+        f"base_output_directory={base_output_directory}",
         run_name="runner_test",
         enable_checkpointing=False,
         monitor_goodput=True,
diff --git a/tests/gpt3_test.py b/tests/gpt3_test.py
index 2a60e3bb14..255fa451f8 100644
--- a/tests/gpt3_test.py
+++ b/tests/gpt3_test.py
@@ -27,6 +27,7 @@
 from MaxText import maxtext_utils
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.common_types import MODEL_MODE_TRAIN
 from MaxText.layers import models
 from MaxText.layers import quantizations
@@ -59,7 +60,7 @@ class GPT3(unittest.TestCase):
   def setUp(self):
     super().setUp()
     self.cfg = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         run_name="test",
         enable_checkpointing=False,
         model_name="gpt3-52k",
diff --git a/tests/grain_data_processing_test.py b/tests/grain_data_processing_test.py
index 6d5c44b500..7ca6de117e 100644
--- a/tests/grain_data_processing_test.py
+++ b/tests/grain_data_processing_test.py
@@ -18,6 +18,7 @@
 import sys
 import os.path
 import tempfile
+from MaxText.gcloud_stub import is_decoupled
 import unittest
 import json
 
@@ -30,6 +31,9 @@
 from MaxText.input_pipeline import _grain_data_processing
 from MaxText.input_pipeline import input_pipeline_interface
 from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT, MAXTEXT_REPO_ROOT
+from maxtext.tests.test_utils import get_test_config_path
+
+MAXTEXT_ASSETS_ROOT = os.path.join("src", "MaxText", "assets")
 
 
 class GrainArrayRecordProcessingTest(unittest.TestCase):
@@ -42,18 +46,49 @@ def setUpClass(cls):
   def setUp(self):
     super().setUp()
     temp_dir = tempfile.gettempdir()
+    decoupled = is_decoupled()
+
+    if decoupled:
+      grain_train_files = os.path.join(
+          MAXTEXT_PKG_DIR,
+          "..",
+          "local_datasets",
+          "c4_en_dataset_minimal",
+          "c4",
+          "en",
+          "3.0.1",
+          "c4-train.array_record-*",
+      )
+      base_output_directory = os.path.join(
+        MAXTEXT_PKG_DIR,
+        "..",
+        "local_datasets",
+        "gcloud_decoupled_test_logs",
+      )
+    else:
+      grain_train_files = os.path.join(
+          temp_dir,
+          "gcsfuse",
+          "array-record",
+          "c4",
+          "en",
+          "3.0.1",
+          "c4-train.array_record*",
+      )
+      base_output_directory = "gs://max-experiments/"
+    
+    config_file = get_test_config_path()
+
     self.config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], config_file],
         per_device_batch_size=1,
         run_name="test",
         mesh_axes=["data"],
         logical_axis_rules=[["batch", "data"]],
         data_sharding=["data"],
-        base_output_directory="gs://max-experiments/",
+        base_output_directory=base_output_directory,
         dataset_type="grain",
-        grain_train_files=os.path.join(
-            temp_dir, "gcsfuse", "array-record", "c4", "en", "3.0.1", "c4-train.array_record*"
-        ),
+        grain_train_files=grain_train_files,
         tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer"),
         enable_checkpointing=False,
     )
@@ -85,6 +120,7 @@ def test_train_ds(self):
         },
     )
 
+  @pytest.mark.external_serving #Skipped in decoupled mode due to rocBLAS scratch buffer TF issues on GPU
   def test_batch_determinism(self):
     batch1 = next(self.train_iter)
     train_iter = _grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
@@ -112,24 +148,47 @@ def get_first_batch(iterator):
 class GrainArrayRecordProcessingWithMultiSourceBlendingTest(GrainArrayRecordProcessingTest):
 
   def setUp(self):
-    super().setUp()
+    # Override parent setUp to use multi-source blending
     temp_dir = tempfile.gettempdir()
-    # We use the same dataset for testing, but you can use different datasets by changing the file patterns.
-    grain_train_files = [
-        f"{temp_dir}/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record-0000*,0.3",
-        f"{temp_dir}/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record-0001*,0.7",
-    ]
-    grain_train_files = ";".join(grain_train_files)
+    decoupled = is_decoupled()
+
+    if decoupled:
+      base_pattern = os.path.join(
+          MAXTEXT_PKG_DIR,
+          "..",
+          "local_datasets",
+          "c4_en_dataset_minimal",
+          "c4",
+          "en",
+          "3.0.1",
+          "c4-train.array_record-*",
+      )
+      base_output_directory = os.path.join(
+        MAXTEXT_PKG_DIR,
+        "..",
+        "local_datasets",
+        "gcloud_decoupled_test_logs",
+      )
+      config_file = get_test_config_path()
+    else:
+      base_pattern = f"{temp_dir}/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record*"
+      base_output_directory = "gs://max-experiments/"
+      config_file = get_test_config_path()
+      # Ensure GCS fuse mounted for cloud path usage
+      mount_gcsfuse()
+
+    train_files_weighted = ";".join([f"{base_pattern},0.3", f"{base_pattern},0.7"])
+
     self.config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], config_file],
         per_device_batch_size=1,
         run_name="test",
         mesh_axes=["data"],
         logical_axis_rules=[["batch", "data"]],
         data_sharding=["data"],
-        base_output_directory="gs://max-experiments/",
+        base_output_directory=base_output_directory,
         dataset_type="grain",
-        grain_train_files=grain_train_files,
+        grain_train_files=train_files_weighted,
         tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer"),
         enable_checkpointing=False,
     )
@@ -150,10 +209,55 @@ class GrainArrayRecordProcessingWithMixtureConfigTest(GrainArrayRecordProcessing
   def setUp(self):
     super().setUp()
     temp_dir = tempfile.gettempdir()
-    mixture_config = {
-        "ds1": {"path": f"{temp_dir}/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record-0000*", "weight": 0.3},
-        "ds2": {"path": f"{temp_dir}/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record-0001*", "weight": 0.7},
-    }
+    decoupled = is_decoupled()
+
+    if decoupled:
+      mixture_config = {
+        "ds1": {
+          "path": os.path.join(
+            MAXTEXT_PKG_DIR,
+            "..",
+            "local_datasets",
+            "c4_en_dataset_minimal",
+            "c4",
+            "en",
+            "3.0.1",
+            "c4-train.array_record-*",
+          ),
+          "weight": 0.3,
+        },
+        "ds2": {
+          "path": os.path.join(
+            MAXTEXT_PKG_DIR,
+            "..",
+            "local_datasets",
+            "c4_en_dataset_minimal",
+            "c4",
+            "en",
+            "3.0.1",
+            "c4-train.array_record-*",
+          ),
+          "weight": 0.7,
+        },
+      }
+      base_output_directory = os.path.join(
+        MAXTEXT_PKG_DIR,
+        "..",
+        "local_datasets",
+        "gcloud_decoupled_test_logs",
+      )
+    else:
+      mixture_config = {
+        "ds1": {
+          "path": f"{temp_dir}/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record-0000*",
+          "weight": 0.3,
+        },
+        "ds2": {
+          "path": f"{temp_dir}/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record-0001*",
+          "weight": 0.7,
+        },
+      }
+      base_output_directory = "gs://max-experiments/"
     self.mixture_config_path = os.path.join(temp_dir, "mixture_config.json")
     with open(self.mixture_config_path, "w", encoding="utf-8") as f:
       json.dump(mixture_config, f)
@@ -165,7 +269,7 @@ def setUp(self):
         mesh_axes=["data"],
         logical_axis_rules=[["batch", "data"]],
         data_sharding=["data"],
-        base_output_directory="gs://max-experiments/",
+        base_output_directory=base_output_directory,
         dataset_type="grain",
         grain_train_mixture_config_path=self.mixture_config_path,
         tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer"),
@@ -187,8 +291,38 @@ class GrainArrayRecordAutoTuneTest(GrainArrayRecordProcessingTest):
   """Test grain data processing with auto-tuning enabled (grain_worker_count=-1)."""
 
   def setUp(self):
-    super().setUp()
     temp_dir = tempfile.gettempdir()
+    decoupled = is_decoupled()
+
+    if decoupled:
+      grain_train_files = os.path.join(
+          MAXTEXT_PKG_DIR,
+          "..",
+          "local_datasets",
+          "c4_en_dataset_minimal",
+          "c4",
+          "en",
+          "3.0.1",
+          "c4-train.array_record-*",
+      )
+      base_output_directory = os.path.join(
+        MAXTEXT_PKG_DIR,
+        "..",
+        "local_datasets",
+        "gcloud_decoupled_test_logs",
+      )
+    else:
+      grain_train_files = os.path.join(
+          temp_dir,
+          "gcsfuse",
+          "array-record",
+          "c4",
+          "en",
+          "3.0.1",
+          "c4-train.array_record*",
+      )
+      base_output_directory = "gs://max-experiments/"
+
     self.config = pyconfig.initialize(
         [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
         per_device_batch_size=1,
@@ -196,12 +330,10 @@ def setUp(self):
         mesh_axes=["data"],
         logical_axis_rules=[["batch", "data"]],
         data_sharding=["data"],
-        base_output_directory="gs://max-experiments/",
+        base_output_directory=base_output_directory,
         dataset_type="grain",
         grain_ram_budget_mb=512,
-        grain_train_files=os.path.join(
-            temp_dir, "gcsfuse", "array-record", "c4", "en", "3.0.1", "c4-train.array_record*"
-        ),
+        grain_train_files=grain_train_files,
         grain_worker_count=-1,  # Enable auto-tuning
         tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer"),
         enable_checkpointing=False,
@@ -237,17 +369,47 @@ def setUpClass(cls):
   def setUp(self):
     super().setUp()
     temp_dir = tempfile.gettempdir()
+    decoupled = is_decoupled()
+
+    if decoupled:
+      grain_train_file = os.path.join(
+          MAXTEXT_PKG_DIR,
+          "..",
+          "local_datasets",
+          "c4_en_dataset_minimal",
+          "hf",
+          "c4",
+          "c4-train-00000-of-01637.parquet",
+      )
+      base_output_directory = os.path.join(
+        MAXTEXT_PKG_DIR,
+        "..",
+        "local_datasets",
+        "gcloud_decoupled_test_logs",
+      )
+      config_file = get_test_config_path()
+    else:
+      grain_train_file = os.path.join(
+          temp_dir,
+          "gcsfuse",
+          "hf",
+          "c4",
+          "c4-train-00000-of-01637.parquet",
+      )
+    base_output_directory = "gs://max-experiments/"
+    config_file = get_test_config_path()
+
     self.config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], config_file],
         per_device_batch_size=1,
         run_name="test",
         mesh_axes=["data"],
         logical_axis_rules=[["batch", "data"]],
         data_sharding=["data"],
-        base_output_directory="gs://max-experiments/",
+        base_output_directory=base_output_directory,
         dataset_type="grain",
         grain_file_type="parquet",
-        grain_train_files=os.path.join(temp_dir, "gcsfuse", "hf", "c4", "c4-train-00000-of-01637.parquet"),
+        grain_train_files=grain_train_file,
         grain_worker_count=1,
         grain_per_worker_buffer_size=1,
         tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer"),
@@ -310,6 +472,9 @@ def mount_gcsfuse():
   Mounts a GCS bucket (gs://maxtext-dataset) to a local directory (/tmp/gcsfuse)
   using gcsfuse if not already mounted.
   """
+  from MaxText.gcloud_stub import is_decoupled
+  if is_decoupled():
+    return  # No-op when decoupled.
   temp_dir = tempfile.gettempdir()
   mount_path = os.path.join(temp_dir, "gcsfuse")
 
@@ -329,3 +494,4 @@ def mount_gcsfuse():
 if __name__ == "__main__":
   mount_gcsfuse()
   unittest.main()
+
diff --git a/tests/grpo_trainer_correctness_test.py b/tests/grpo_trainer_correctness_test.py
index 23e02c7a99..b6829604ce 100644
--- a/tests/grpo_trainer_correctness_test.py
+++ b/tests/grpo_trainer_correctness_test.py
@@ -54,6 +54,8 @@
 from MaxText.inference.offline_engine import InputData
 from MaxText.experimental.rl import grpo_utils
 
+# This test is for serving pathways via offline_engine and maxengine.
+pytestmark = [pytest.mark.external_serving]
 
 def get_golden_data(config):
   """Get the golden data for GrpoTrainer from maxtext/MaxText/scratch_code/generate_grpo_golden_logits.py."""
diff --git a/tests/hf_data_processing_test.py b/tests/hf_data_processing_test.py
index 622a872fa6..27d0b1afc1 100644
--- a/tests/hf_data_processing_test.py
+++ b/tests/hf_data_processing_test.py
@@ -17,6 +17,7 @@
 import sys
 import unittest
 import os.path
+from MaxText.gcloud_stub import is_decoupled
 
 import jax
 from jax.sharding import Mesh
@@ -24,6 +25,7 @@
 
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.input_pipeline import _hf_data_processing
 from MaxText.input_pipeline import input_pipeline_interface
 
@@ -32,22 +34,28 @@ class HfDataProcessingTest(unittest.TestCase):
 
   def setUp(self):
     super().setUp()
-    config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
-        per_device_batch_size=1,
-        run_name="test",
-        mesh_axes=["data"],
-        logical_axis_rules=[["batch", "data"]],
-        data_sharding=["data"],
-        base_output_directory="gs://max-experiments/",
-        dataset_type="hf",
-        hf_path="parquet",
-        hf_data_dir="",
-        hf_train_files="gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet",
-        tokenizer_path="google-t5/t5-large",
-        enable_checkpointing=False,
+    decoupled = is_decoupled()
+    temp_local_logs = os.path.join("local_datasets", "gcloud_decoupled_test_logs")
+    base_output_directory = temp_local_logs if decoupled else "gs://max-experiments/"
+    self.config = pyconfig.initialize(
+      [sys.argv[0], get_test_config_path()],
+      per_device_batch_size=1,
+      run_name="test",
+      mesh_axes=["data"],
+      logical_axis_rules=[["batch", "data"]],
+      data_sharding=["data"],
+      base_output_directory=base_output_directory,
+      dataset_type="hf",
+      hf_path="parquet",
+      hf_data_dir="",
+      hf_train_files=(
+        os.path.join(
+          "local_datasets","c4_en_dataset_minimal","hf","c4","c4-train-00000-of-01637.parquet"
+        ) if decoupled else "gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet"
+      ),
+      tokenizer_path="google-t5/t5-large",
+      enable_checkpointing=False,
     )
-    self.config = config
     self.mesh_shape_1d = (len(jax.devices()),)
     self.mesh = Mesh(mesh_utils.create_device_mesh(self.mesh_shape_1d), self.config.mesh_axes)
     self.process_indices = input_pipeline_interface.get_process_loading_real_data(
diff --git a/tests/inference/page_manager_test.py b/tests/inference/page_manager_test.py
index 22035c9dde..3534b3c12b 100644
--- a/tests/inference/page_manager_test.py
+++ b/tests/inference/page_manager_test.py
@@ -23,6 +23,7 @@
 
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.inference.page_manager import PageManager, PageState
 
 
@@ -38,7 +39,7 @@ def setUp(self):
     self.max_pages_per_group = (self.max_target_length + self.tokens_per_page - 1) // self.tokens_per_page
 
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         per_device_batch_size=1.0,
         run_name="test",
         enable_checkpointing=False,
diff --git a/tests/inference/test_llama2_7b_bf16.sh b/tests/inference/test_llama2_7b_bf16.sh
index 672611932c..61f64c1c46 100755
--- a/tests/inference/test_llama2_7b_bf16.sh
+++ b/tests/inference/test_llama2_7b_bf16.sh
@@ -1,13 +1,18 @@
 #!/bin/bash
 
+CONFIG_PATH="${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/configs/base.yml"
+if [ "${DECOUPLE_GCLOUD^^}" = "TRUE" ]; then
+  CONFIG_PATH="${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/configs/decoupled_base_test.yml"
+fi
+
 # Define the arguments in an array
 args=(
   "-m"
   "MaxText.decode"
-  "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/configs/base.yml"
+  "${CONFIG_PATH}"
   "tokenizer_path=${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText/assets}}/tokenizer.llama2"
   "model_name=llama2-7b"
-  "load_parameters_path=gs://runner-maxtext-logs/direct_generate_param_only_checkpoint_2024-06-11-04-13/checkpoints/0/items/"
+  "load_parameters_path=gs://runner-maxtext-logs/direct_generate_param_only_checkpoint_2024-06-11-04-13/checkpoints/0/items/" # TODO(gulsumgudukbay) pre-generated checkpoint
   "checkpoint_is_quantized=false"
   "weight_dtype=bfloat16"
   "max_prefill_predict_length=16"
diff --git a/tests/inference/test_llama2_7b_int8.sh b/tests/inference/test_llama2_7b_int8.sh
index 50aa2c0dc9..4dc531359c 100755
--- a/tests/inference/test_llama2_7b_int8.sh
+++ b/tests/inference/test_llama2_7b_int8.sh
@@ -1,13 +1,18 @@
 #!/bin/bash
 
+CONFIG_PATH="${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/configs/base.yml"
+if [ "${DECOUPLE_GCLOUD^^}" = "TRUE" ]; then
+  CONFIG_PATH="${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/configs/decoupled_base_test.yml"
+fi
+
 # Define the arguments in an array
 args=(
   "-m"
   "MaxText.decode"
-  "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/configs/base.yml"
+  "${CONFIG_PATH}"
   "tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText/assets}}"/tokenizer.llama2"
   "model_name=llama2-7b"
-  "load_parameters_path=gs://msingh-bkt/checkpoints/quant_llama2-7b-chat/20241120034012/int8_"
+  "load_parameters_path=gs://msingh-bkt/checkpoints/quant_llama2-7b-chat/20241120034012/int8_" # TODO(gulsumgudukbay):  pre-generated quant checkpoint
   "checkpoint_is_quantized=true"
   "quantization=int8"
   "weight_dtype=bfloat16"
diff --git a/tests/integration_tests/checkpoint_compatibility_test.py b/tests/integration_tests/checkpoint_compatibility_test.py
index 200d575c5f..ea2fcef8c3 100644
--- a/tests/integration_tests/checkpoint_compatibility_test.py
+++ b/tests/integration_tests/checkpoint_compatibility_test.py
@@ -86,6 +86,7 @@ def test_autoselected_attention():
   run_checkpoint_compatibility("tpu", "autoselected")
 
 
+@pytest.mark.external_serving
 @pytest.mark.integration_test
 @pytest.mark.gpu_only
 def test_with_dot_product():
diff --git a/tests/integration_tests/checkpointing_test.py b/tests/integration_tests/checkpointing_test.py
index 4ee93632d8..5cd388a589 100644
--- a/tests/integration_tests/checkpointing_test.py
+++ b/tests/integration_tests/checkpointing_test.py
@@ -28,8 +28,12 @@
 import json
 from math import isclose
 import os.path
+from MaxText.gcloud_stub import is_decoupled
+import glob
+import jax
 import pytest
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.train import main as train_main
 
 
@@ -48,6 +52,11 @@ def get_checkpointing_command(run_date, hardware, steps, metrics_file, attention
   Returns:
     A list of strings representing the command line arguments.
   """
+  base_output_directory = (
+      os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs")
+      if is_decoupled()
+      else "gs://runner-maxtext-logs"
+  )
   model_params = [
       "base_emb_dim=384",
       "base_num_query_heads=8",
@@ -62,10 +71,18 @@ def get_checkpointing_command(run_date, hardware, steps, metrics_file, attention
         "enable_single_controller=True",
         "checkpoint_storage_use_zarr3=False",
     ]
+
+  extra_parallelism = []
+  if is_decoupled():  # Match device topology in decoupled/local mode
+    try:
+      extra_parallelism.append(f"ici_fsdp_parallelism={jax.device_count()}")
+    except Exception as e:  # pragma: no cover - defensive
+      print(f"Warning: unable to determine jax.device_count(): {e}")
+
   return (
       [
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+          get_test_config_path(),
           f"hardware={hardware}",
           f"run_name=runner_{run_date}",
           f"steps={steps}",
@@ -73,7 +90,7 @@ def get_checkpointing_command(run_date, hardware, steps, metrics_file, attention
           "per_device_batch_size=1",
           f"metrics_file={metrics_file}",
           "checkpoint_period=3",
-          "base_output_directory=gs://runner-maxtext-logs",
+          "base_output_directory={base_output_directory}",
           f"dataset_path={dataset_path}",
           f"dataset_type={dataset_type}",
           "async_checkpointing=False",
@@ -81,6 +98,7 @@ def get_checkpointing_command(run_date, hardware, steps, metrics_file, attention
       ]
       + model_params
       + pathways_command
+      + extra_parallelism
   )
 
 
@@ -115,9 +133,25 @@ def run_checkpointing(hardware, attention_type):
     attention_type: The type of attention to use.
   """
   run_date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+  
+  # Determine dataset path/pattern depending on decoupled mode.
+  gcsfuse_pattern = "/tmp/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record*"
+  local_decoupled_root = os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal", "c4", "en", "3.0.1")
+  local_pattern = os.path.join(local_decoupled_root, "c4-train.array_record*")
+  selected_pattern = gcsfuse_pattern
+  dataset_path = "/tmp/gcsfuse"
+  
+  if is_decoupled():
+    # Prefer local minimal dataset if gcsfuse data absent
+    if not glob.glob(gcsfuse_pattern) and glob.glob(local_pattern):
+      selected_pattern = local_pattern
+      dataset_path = os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets")
+    elif not glob.glob(gcsfuse_pattern) and not glob.glob(local_pattern):
+      pytest.skip("No grain ArrayRecord shards found for checkpointing test in decoupled mode.")
+
   grain_command = [
       "grain_worker_count=0",
-      "grain_train_files=/tmp/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record*",
+      f"grain_train_files={selected_pattern}",
   ]
   train_main(
       get_checkpointing_command(
@@ -127,7 +161,7 @@ def run_checkpointing(hardware, attention_type):
           metrics_file="saved_metrics.txt",
           attention_type=attention_type,
           dataset_type="grain",
-          dataset_path="/tmp/gcsfuse",
+          dataset_path=dataset_path,
       )
       + grain_command
   )
@@ -140,7 +174,7 @@ def run_checkpointing(hardware, attention_type):
           metrics_file="restored_metrics.txt",
           attention_type=attention_type,
           dataset_type="grain",
-          dataset_path="/tmp/gcsfuse",
+          dataset_path=dataset_path,
       )
       + grain_command
   )
diff --git a/tests/integration_tests/generate_param_only_checkpoint_test.py b/tests/integration_tests/generate_param_only_checkpoint_test.py
index 08a8c5a03c..7493faa0b3 100644
--- a/tests/integration_tests/generate_param_only_checkpoint_test.py
+++ b/tests/integration_tests/generate_param_only_checkpoint_test.py
@@ -18,9 +18,11 @@
 """
 from datetime import datetime
 import os
+from MaxText.gcloud_stub import is_decoupled
 import pytest
 
 from MaxText.globals import MAXTEXT_ASSETS_ROOT, MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.train import main as train_main
 from MaxText.decode import main as decode_main
 from MaxText.generate_param_only_checkpoint import main as generate_param_only_ckpt_main
@@ -41,11 +43,22 @@ def get_model_params(quantization):
 
 def run_e2e_test_flow(hardware, model_config, attention_type="autoselected", state_path=None):
   """Helper function to run training, generate parameter-only checkpoint, and decode."""
+  decoupled = is_decoupled()
+  base_output_directory = (
+      os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs")
+      if decoupled
+      else "gs://runner-maxtext-logs"
+  )
+  dataset_path = (
+    os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal")
+    if decoupled
+    else "gs://maxtext-dataset"
+  )
   run_date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
   test_config = [
       None,
-      os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-      "base_output_directory=gs://runner-maxtext-logs",
+      get_test_config_path(),
+      f"base_output_directory={base_output_directory}",
       "async_checkpointing=False",
       f"hardware={hardware}",
       f"attention={attention_type}",
@@ -67,10 +80,10 @@ def run_e2e_test_flow(hardware, model_config, attention_type="autoselected", sta
             metrics_file="run_metrics.txt",
             attention_type=attention_type,
             dataset_type="tfds",
-            dataset_path="gs://maxtext-dataset",
+            dataset_path=dataset_path,
         )
     )
-    state_path = f"gs://runner-maxtext-logs/runner_{run_date}/checkpoints/0/items"
+    state_path = f"{base_output_directory}/runner_{run_date}/checkpoints/0/items"
 
   # Generate parameter-only checkpoint
   generate_param_only_ckpt_config = (
@@ -88,7 +101,7 @@ def run_e2e_test_flow(hardware, model_config, attention_type="autoselected", sta
       test_config
       + [
           f"run_name=decode_{run_date}",
-          f"load_parameters_path=gs://runner-maxtext-logs/generate_param_{run_date}/checkpoints/0/items",
+          f"load_parameters_path={base_output_directory}/generate_param_{run_date}/checkpoints/0/items",
       ]
       + pathways_command
   )
@@ -107,6 +120,7 @@ def test_param_ckpt_generation_with_autoselected_attention(quantization, capsys)
   assert expected_output in captured.out
 
 
+@pytest.mark.external_serving
 @pytest.mark.integration_test
 @pytest.mark.gpu_only
 @pytest.mark.parametrize("quantization", [(""), ("int8")])
@@ -123,6 +137,7 @@ def test_param_ckpt_generation_with_dot_product(quantization, capsys):
 @pytest.mark.integration_test
 @pytest.mark.tpu_only
 @pytest.mark.scheduled_only
+@pytest.mark.external_serving  # Requires pre-generated checkpoint (Gemma-2b)
 def test_param_ckpt_generation_with_pre_generated_ckpt(capsys):
   """Tests the parameter-only checkpoint generation and decode flow with a pre-generated Gemma-2b model checkpoint."""
   model_config = [
diff --git a/tests/integration_tests/gradient_accumulation_test.py b/tests/integration_tests/gradient_accumulation_test.py
index 8e6db3043d..d2e37fc156 100644
--- a/tests/integration_tests/gradient_accumulation_test.py
+++ b/tests/integration_tests/gradient_accumulation_test.py
@@ -15,6 +15,8 @@
 """Integration tests for gradient accumulation."""
 
 import tempfile
+import pytest
+from MaxText.gcloud_stub import is_decoupled
 
 import numpy as np
 import json
@@ -26,6 +28,7 @@
 
 from MaxText.train import main as train_main
 from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT
+from maxtext.tests.test_utils import get_test_config_path
 
 
 def generate_random_string(length=10):
@@ -38,15 +41,26 @@ class GradientAccumulationTest(unittest.TestCase):
   @pytest.mark.integration_test
   @pytest.mark.tpu_only
   def test_grad_accumulate_same_loss(self):
+    decoupled = is_decoupled()
+    base_output_directory = (
+         os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs")
+         if decoupled
+          else "gs://runner-maxtext-logs"
+    )
+    dataset_path = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal")
+        if decoupled
+        else "gs://maxtext-dataset"
+    )
     random_suffix = generate_random_string()
     temp_dir = tempfile.gettempdir()
     run_accumulate_metrics_file = os.path.join(temp_dir, f"runner_grad_accumulate_{random_suffix}.txt")
     run_regular_metrics_file = os.path.join(temp_dir, f"runner_regular_{random_suffix}.txt")
     shared_maxtext_args = [
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
-        "dataset_path=gs://maxtext-dataset",
+        get_test_config_path(),
+        f"base_output_directory={base_output_directory}",
+        f"dataset_path={dataset_path}",
         "gradient_clipping_threshold=0",  # Ensures we are testing raw scales of gradients (clipping off)
         "enable_checkpointing=False",
         "enable_goodput_recording=False",
diff --git a/tests/integration_tests/grpo_correctness.py b/tests/integration_tests/grpo_correctness.py
index 3778617054..d0cd11e7b8 100644
--- a/tests/integration_tests/grpo_correctness.py
+++ b/tests/integration_tests/grpo_correctness.py
@@ -40,7 +40,9 @@
 from MaxText.common_types import MODEL_MODE_TRAIN
 from MaxText.globals import MAXTEXT_PKG_DIR
 from MaxText.layers import models
+import pytest
 
+pytestmark = [pytest.mark.external_serving] # uses pre-generated checkpoint
 
 class GRPOTest(unittest.TestCase):
 
diff --git a/tests/integration_tests/inference_microbenchmark_smoke_test.py b/tests/integration_tests/inference_microbenchmark_smoke_test.py
index 3ae010542d..cca4377e2f 100644
--- a/tests/integration_tests/inference_microbenchmark_smoke_test.py
+++ b/tests/integration_tests/inference_microbenchmark_smoke_test.py
@@ -19,10 +19,7 @@
 import unittest
 from absl.testing import absltest
 
-from MaxText import pyconfig
-from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT
-from MaxText.inference_microbenchmark import run_benchmarks
-
+pytestmark = [pytest.mark.external_serving]
 
 class Inference_Microbenchmark(unittest.TestCase):
   """integration test for inference microbenchmark"""
@@ -30,6 +27,12 @@ class Inference_Microbenchmark(unittest.TestCase):
   @pytest.mark.integration_test
   @pytest.mark.tpu_only
   def test(self):
+    # Lazy imports to avoid import-time side effects when deselected
+    import jax
+    from MaxText import pyconfig
+    from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT
+    from MaxText.inference_microbenchmark import run_benchmarks
+
     jax.config.update("jax_default_prng_impl", "unsafe_rbg")
     config = pyconfig.initialize(
         [
diff --git a/tests/integration_tests/sft_trainer_correctness_test.py b/tests/integration_tests/sft_trainer_correctness_test.py
index fe54296ff0..6872e9f8d9 100644
--- a/tests/integration_tests/sft_trainer_correctness_test.py
+++ b/tests/integration_tests/sft_trainer_correctness_test.py
@@ -25,6 +25,7 @@
 """
 
 import os.path
+import pytest
 
 import jsonlines
 import pytest
@@ -145,6 +146,7 @@ def get_token_log_probs(logits, inputs):
   return token_log_probs
 
 
+@pytest.mark.external_training # setUpClass does gsutil tokenizer
 class SFTTrainerCorrectnessTest(unittest.TestCase):
 
   @classmethod
diff --git a/tests/integration_tests/standalone_dl_ckpt_test.py b/tests/integration_tests/standalone_dl_ckpt_test.py
index 64b92e6686..7ba6174054 100644
--- a/tests/integration_tests/standalone_dl_ckpt_test.py
+++ b/tests/integration_tests/standalone_dl_ckpt_test.py
@@ -18,10 +18,12 @@
 from tools.gcs_benchmarks.standalone_checkpointer import main as sckpt_main
 from tools.gcs_benchmarks.standalone_dataloader import main as sdl_main
 from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT
+from maxtext.tests.test_utils import get_test_config_path
 from datetime import datetime
 import random
 import string
 import os.path
+from MaxText.gcloud_stub import is_decoupled
 
 
 class Standalone_DL_CKPT(unittest.TestCase):
@@ -38,13 +40,24 @@ def _get_random_test_name(self, test_name):
   @pytest.mark.tpu_only
   def test_standalone_dataloader(self):
     random_run_name = self._get_random_test_name("standalone_dataloader")
+    decoupled = is_decoupled()
+    base_output_directory = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs")
+        if decoupled
+        else "gs://runner-maxtext-logs"
+    )
+    dataset_path = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal")
+        if decoupled
+        else "gs://maxtext-dataset"
+    )
     sdl_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"run_name={random_run_name}",
-            "base_output_directory=gs://runner-maxtext-logs",
-            "dataset_path=gs://maxtext-dataset",
+            f"base_output_directory={base_output_directory}",
+            f"dataset_path={dataset_path}",
             "steps=100",
             "enable_checkpointing=false",
             "enable_goodput_recording=False",
@@ -55,15 +68,26 @@ def test_standalone_dataloader(self):
   @pytest.mark.integration_test
   @pytest.mark.tpu_only
   def test_standalone_checkpointer(self):
+    decoupled = is_decoupled()
+    base_output_directory = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs")
+        if decoupled
+        else "gs://runner-maxtext-logs"
+    )
+    dataset_path = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal")
+        if decoupled
+        else "gs://maxtext-dataset"
+    )
     random_run_name = self._get_random_test_name("standalone_checkpointer")
     # checkpoint at 50
     sckpt_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"run_name={random_run_name}",
-            "base_output_directory=gs://runner-maxtext-logs",
-            "dataset_path=gs://maxtext-dataset",
+            f"base_output_directory={base_output_directory}",
+            f"dataset_path={dataset_path}",
             "base_emb_dim=128",
             "base_num_query_heads=4",
             "base_num_kv_heads=4",
@@ -81,10 +105,10 @@ def test_standalone_checkpointer(self):
     sckpt_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"run_name={random_run_name}",
-            "base_output_directory=gs://runner-maxtext-logs",
-            "dataset_path=gs://maxtext-dataset",
+            f"base_output_directory={base_output_directory}",
+            f"dataset_path={dataset_path}",
             "base_emb_dim=128",
             "base_num_query_heads=4",
             "base_num_kv_heads=4",
diff --git a/tests/integration_tests/train_tests.py b/tests/integration_tests/train_tests.py
index cc81a8e063..3eedca7f6c 100644
--- a/tests/integration_tests/train_tests.py
+++ b/tests/integration_tests/train_tests.py
@@ -14,143 +14,165 @@
 
 """Tests for train.py with various configs"""
 import os
+from MaxText.gcloud_stub import is_decoupled
 import unittest
 import pytest
 import jax
 from MaxText.train import main as train_main
 from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT
+from maxtext.tests.test_utils import get_test_config_path
 from absl.testing import absltest
 
 
 class TrainTests(unittest.TestCase):
   """Tests train.py with various configs"""
+  decoupled = is_decoupled()
+  dev_count = jax.device_count()
+  _base_output_directory = (
+    os.path.join("local_datasets", "gcloud_decoupled_test_logs")
+    if decoupled
+    else "gs://runner-maxtext-logs"
+  )
+  dataset_path = (
+    os.path.join("local_datasets", "c4_en_dataset_minimal")
+    if decoupled
+    else "gs://maxtext-dataset"
+  )
+  
+  # FSDP override logic for tensor-parallel=4 configs: provide an axis only when cleanly divisible.
+  _fsdp_tp4_override = []
+  if decoupled:
+    if dev_count >= 4 and dev_count % 4 == 0:
+      _fsdp_tp4_override = [f"ici_fsdp_parallelism={dev_count // 4}"]
+    elif dev_count < 4:
+      _fsdp_tp4_override = [f"ici_fsdp_parallelism={dev_count}"]
 
   CONFIGS = {
       "base": [  # short test for train.py with TFDS c4
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "synthetic": [  # tests base config with synthetic dataset
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           "dataset_type=synthetic",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "pdb_lt_1": [  # tests base config with per_device_batch_size < 1
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           "per_device_batch_size=0.25",
           "ici_tensor_parallelism=4",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "tp_transpose": [  # tests base config with ici_tensor_transpose_parallelism=4
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "steps=2",
           "ici_tensor_transpose_parallelism=4",
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "int8": [  # tests base config with int8
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "quantization=int8",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "fp8": [  # tests base config with fp8
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "quantization=fp8",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "nanoo_fp8": [  # tests base config with nanoo_fp8
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "quantization=nanoo_fp8",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "te_fp8_delayedscaling": [  # tests base config with te_fp8_delayedscaling
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "quantization=te_fp8_delayedscaling",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "te_fp8_currentscaling": [  # tests base config with te_fp8_currentscaling
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "quantization=te_fp8_currentscaling",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "te_mxfp8": [  # tests base config with te_mxfp8
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "quantization=te_mxfp8",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "dropout": [  # tests base config with dropout
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
-          "dataset_path=gs://maxtext-dataset",
+          f"dataset_path={dataset_path}",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
@@ -158,20 +180,20 @@ class TrainTests(unittest.TestCase):
           "per_device_batch_size=1",
           "dropout_rate=0.02",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
       "hf_input_pipeline": [  # test for train.py with TFDS c4, using HF input pipeline
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-          "base_output_directory=gs://runner-maxtext-logs",
+          get_test_config_path(),
+          f"base_output_directory={_base_output_directory}",
           "run_name=runner_test",
           "steps=2",
           "enable_checkpointing=False",
           "enable_goodput_recording=False",
           "dataset_type=hf",
           "hf_path=parquet",
-          "hf_train_files=gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet",
+          f"hf_train_files={dataset_path}/hf/c4/c4-train-00000-of-01637.parquet",
           "tokenizer_path=google-t5/t5-large",
-      ],
+      ] + ([f"ici_fsdp_parallelism={dev_count}"] if decoupled else []),
   }
 
   @pytest.mark.integration_test
@@ -207,7 +229,11 @@ def test_tpu_pdb_lt_1(self):
   @pytest.mark.integration_test
   @pytest.mark.gpu_only
   def test_gpu_pdb_lt_1(self):
-    train_main(TrainTests.CONFIGS["pdb_lt_1"] + ["attention=dot_product"])
+    # In decoupled (offline) mode this fractional batch config produces zero TFLOPs and a divide-by-zero in logging.
+    if self.decoupled:
+      pytest.skip("Skipping pdb_lt_1 in decoupled mode: known divide by zero in TFLOPs logging for per_device_batch_size < 1.")
+    cfg = TrainTests.CONFIGS["pdb_lt_1"] + ["attention=dot_product"]
+    train_main(cfg)
 
   @pytest.mark.integration_test
   @pytest.mark.tpu_only
@@ -224,11 +250,13 @@ def test_gpu_int8(self):
   def test_tpu_fp8(self):
     train_main(TrainTests.CONFIGS["fp8"])
 
+  @pytest.mark.external_serving
   @pytest.mark.integration_test
   @pytest.mark.gpu_only
   def test_gpu_fp8(self):
     train_main(TrainTests.CONFIGS["fp8"] + ["attention=dot_product"])
 
+  @pytest.mark.external_serving
   @pytest.mark.integration_test
   @pytest.mark.gpu_only
   def test_gpu_nanoo_fp8(self):
@@ -274,6 +302,7 @@ def test_gpu_dropout(self):
   def test_tpu_hf_input_pipeline(self):
     train_main(TrainTests.CONFIGS["hf_input_pipeline"])
 
+  @pytest.mark.external_serving
   @pytest.mark.integration_test
   @pytest.mark.gpu_only
   def test_gpu_hf_input_pipeline(self):
@@ -282,13 +311,15 @@ def test_gpu_hf_input_pipeline(self):
   @pytest.mark.integration_test
   @pytest.mark.gpu_only
   def test_gpu_cudnn_flash_te(self):
+    if not jax.local_devices() or jax.local_devices()[0].platform != "cuda":
+      pytest.skip("Skipping cudnn_flash_te test: CUDA/cuDNN not available")
     os.environ["NVTE_FUSED_ATTN"] = "1"  # Enable fused attention
     cudnn_flash_te = [  # tests base config on GPU with flash attention
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=2",
         "enable_checkpointing=False",
         "enable_goodput_recording=False",
@@ -304,10 +335,10 @@ def test_gpu_context_parallelism(self):
     os.environ["NVTE_FUSED_ATTN"] = "1"  # Enable fused attention
     context_parallel = [  # tests base config on GPU with All-Gather based context parallelism
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=10",
         "enable_checkpointing=False",
         "enable_goodput_recording=False",
@@ -319,6 +350,13 @@ def test_gpu_context_parallelism(self):
         "packing=False",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
     ]
+    if self.decoupled:
+      context_parallel.append("shardy=False")
+      axis = next((int(a.split("=")[1]) for a in context_parallel if isinstance(a, str) and a.startswith("ici_context_parallelism=")), 1)
+      fsdp = self.dev_count // axis if axis > 0 and self.dev_count % axis == 0 else self.dev_count
+      context_parallel.append(f"ici_fsdp_parallelism={fsdp}")
+    print("Using dataset_path:", self.dataset_path)
+    print("Exists:", os.path.exists(self.dataset_path))
     train_main(context_parallel)
 
   @pytest.mark.integration_test
@@ -327,10 +365,10 @@ def test_gpu_tensor_parallelism(self):
     os.environ["NVTE_FUSED_ATTN"] = "1"  # Enable fused attention
     tensor_parallel = [  # tests base config on GPU with Tensor Parallelism
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=10",
         "enable_checkpointing=False",
         "enable_goodput_recording=False",
@@ -340,6 +378,11 @@ def test_gpu_tensor_parallelism(self):
         "packing=False",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
     ]
+    if self.decoupled:
+      tensor_parallel.append("shardy=False")
+      axis = next((int(a.split("=")[1]) for a in tensor_parallel if isinstance(a, str) and a.startswith("ici_tensor_parallelism=")), 1)
+      fsdp = self.dev_count // axis if axis > 0 and self.dev_count % axis == 0 else self.dev_count
+      tensor_parallel.append(f"ici_fsdp_parallelism={fsdp}")
     train_main(tensor_parallel)
 
   @pytest.mark.integration_test
@@ -348,10 +391,10 @@ def test_gpu_optimizer_offload(self):
     os.environ["NVTE_FUSED_ATTN"] = "1"  # Enable fused attention
     optimizer_offload = [  # tests base config on GPU with optimizer state offload
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=10",
         "attention=dot_product",
         "optimizer_memory_host_offload=True",  # enable optimizer state offload
@@ -360,7 +403,7 @@ def test_gpu_optimizer_offload(self):
         "enable_goodput_recording=False",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
     ]
-    train_main(optimizer_offload)
+    train_main(optimizer_offload + ([f"ici_fsdp_parallelism={self.dev_count}"] if self.decoupled else []))
 
   @pytest.mark.integration_test
   @pytest.mark.gpu_only
@@ -368,10 +411,10 @@ def test_gpu_parameter_offload(self):
     os.environ["NVTE_FUSED_ATTN"] = "1"  # Enable fused attention
     parameter_offload = [  # tests base config on GPU with parameter offload
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=10",
         "param_scan_axis=0",  # scan axis 0 is required for parameter offload
         "attention=dot_product",
@@ -381,16 +424,18 @@ def test_gpu_parameter_offload(self):
         "enable_goodput_recording=False",
         rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
     ]
-    train_main(parameter_offload)
+    train_main(parameter_offload + ([f"ici_fsdp_parallelism={self.dev_count}"] if self.decoupled else []))
 
   @pytest.mark.gpu_only
   def test_gpu_cudnn_flash_jax(self):
+    if not jax.local_devices() or jax.local_devices()[0].platform != "cuda":
+      pytest.skip("Skipping cudnn_flash_jax test: CUDA/cuDNN not available")
     cudnn_flash_jax = [  # tests base config on GPU with flash attention
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=2",
         "enable_checkpointing=False",
         "enable_goodput_recording=False",
@@ -422,10 +467,10 @@ def test_gpu_zero1_gradient_accumulation(self):
     os.environ["NVTE_FUSED_ATTN"] = "1"  # Enable fused attention
     zero1_ga = [  # tests Zero-1 optimizer sharding with gradient accumulation
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=10",
         "enable_checkpointing=False",
         "enable_goodput_recording=False",
@@ -450,16 +495,21 @@ def test_gpu_zero1_gradient_accumulation(self):
   @pytest.mark.gpu_only
   def test_gpu_packed_attention(self):
     gpu_device = jax.devices("gpu")[0]
-    compute_capability = gpu_device.compute_capability
-    if float(compute_capability) < 9.0:
-      pytest.skip("Packed (THD) attention is only supported on sm90+!")
+    compute_capability = getattr(gpu_device, "compute_capability", None)
+    try:
+      if float(compute_capability) < 9.0:
+        pytest.skip("Packed (THD) attention is only supported on sm90+!")
+    except Exception:
+      # Non-numeric or unknown capability (e.g. ROCm 'gfx942') — skip the test.
+      print("checking if Packed THD attention is supported on ROCm...")
+      #pytest.skip("Packed (THD) attention is only supported on sm90+!")
     os.environ["NVTE_FUSED_ATTN"] = "1"  # Enable fused attention
     packed_attention = [  # tests base config on GPU with Packed (THD) attention
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=10",
         "enable_checkpointing=False",
         "enable_goodput_recording=False",
@@ -477,10 +527,10 @@ def test_gpu_ring_attention(self):
     os.environ["NVTE_FUSED_RING_ATTENTION_USE_SCAN"] = "0"  # Disable scan for ring attention
     ring_attention = [  # tests base config on GPU with ring attention
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-        "base_output_directory=gs://runner-maxtext-logs",
+        get_test_config_path(),
+        f"base_output_directory={self._base_output_directory}",
         "run_name=runner_test",
-        "dataset_path=gs://maxtext-dataset",
+        f"dataset_path={self.dataset_path}",
         "steps=10",
         "enable_checkpointing=False",
         "enable_goodput_recording=False",
@@ -498,3 +548,4 @@ def test_gpu_ring_attention(self):
 
 if __name__ == "__main__":
   absltest.main()
+
diff --git a/tests/integration_tests/vision_encoder_test.py b/tests/integration_tests/vision_encoder_test.py
index 3cc3ad56dc..11c5588e5a 100644
--- a/tests/integration_tests/vision_encoder_test.py
+++ b/tests/integration_tests/vision_encoder_test.py
@@ -33,8 +33,10 @@
 from MaxText import multimodal_utils
 from MaxText.layers import models
 from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_TEST_ASSETS_ROOT, MAXTEXT_ASSETS_ROOT
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText import maxengine
 
+pytestmark = [pytest.mark.external_serving]  # uses pre-generated multimodal checkpoint
 
 # 4b with vit
 DEFAULT_LOAD_PARAMETERS_PATH = (
@@ -47,7 +49,7 @@ class VisionEncoderEmbeddingTest(unittest.TestCase):
   CONFIGS = {
       "gemma3-4b": [  # tests decode with multimodal gemma-4b
           None,
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+          get_test_config_path(),
           "model_name=gemma3-4b",
           rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.gemma3')}",
           "use_multimodal=True",
diff --git a/tests/max_utils_test.py b/tests/max_utils_test.py
index 3e9b1dac6d..09963e95dd 100644
--- a/tests/max_utils_test.py
+++ b/tests/max_utils_test.py
@@ -119,7 +119,7 @@ def init_pyconfig(self, **kwargs):
         "model_name": "llama3.1-8b",
     } | kwargs
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **init_kwargs,
     )
     return config
diff --git a/tests/maxengine_test.py b/tests/maxengine_test.py
index e58b203c17..8ebad2cf55 100644
--- a/tests/maxengine_test.py
+++ b/tests/maxengine_test.py
@@ -30,10 +30,12 @@
 from MaxText import pyconfig, maxengine
 from MaxText.common_types import DECODING_ACTIVE_SEQUENCE_INDICATOR, MODEL_MODE_PREFILL
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.layers import models
 from MaxText.layers import quantizations
 from MaxText.maxengine import MaxEngine
 
+pytestmark = [pytest.mark.external_serving]
 
 class MaxEngineTest(unittest.TestCase):
   """Tests for MaxEngine."""
@@ -61,7 +63,7 @@ def init_pyconfig(self, **kwargs):
         "return_log_prob": True,
     } | kwargs
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **init_kwargs,
     )
     return config
@@ -79,7 +81,7 @@ def get_data(self):
 
   def test_stack_and_unstack_prefill_cache(self):
     config = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         enable_checkpointing=False,
         stack_prefill_result_cache=True,
     )
diff --git a/tests/maxtext_utils_test.py b/tests/maxtext_utils_test.py
index 30a0032f89..a497ea9acd 100644
--- a/tests/maxtext_utils_test.py
+++ b/tests/maxtext_utils_test.py
@@ -39,6 +39,8 @@
 from MaxText import pyconfig
 from MaxText.common_types import MODEL_MODE_TRAIN
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 from MaxText.layers import models
 from MaxText.layers import quantizations
 from MaxText.sharding import assert_params_sufficiently_sharded, get_formatted_sharding_annotations
@@ -185,7 +187,7 @@ class MaxUtilsInitStateWithMultipleCollections(unittest.TestCase):
 
   def setUp(self):
     self.config = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")], enable_checkpointing=False
+        [None, get_test_config_path()], enable_checkpointing=False
     )
     self.model = ModelWithMultipleCollections(self.config.max_target_length, nnx.Rngs(0))
     self.key = random.key(0)
@@ -236,8 +238,10 @@ class MaxUtilsInitTransformerState(unittest.TestCase):
   """Tests initialization of transformer states in max_utils.py"""
 
   def setUp(self):
+    # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
     self.config = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")], enable_checkpointing=False
+        [None, get_test_config_path()], enable_checkpointing=False, **extra_args
     )
     devices_array = maxtext_utils.create_device_mesh(self.config)
     self.mesh = Mesh(devices_array, self.config.mesh_axes)
@@ -366,7 +370,8 @@ def test_multi_axis_sharding_pass(self):
     multi-dimensional mesh passes the assertion.
     """
     # Create a mesh shape for a 5D mesh.
-    devices = np.array(jax.devices()).reshape((4, 1, 1, 1, 1))
+    num_devices = jax.device_count()
+    devices = np.array(jax.devices()).reshape((num_devices, 1, 1, 1, 1))
     mesh = Mesh(devices, self.mesh_axes)
 
     # Shard across multiple axes, including the valid 'fsdp' axis.
@@ -381,7 +386,8 @@ def test_multi_axis_not_sharded_fails(self):
     Tests that a tensor on a complex mesh fails if it's not sharded along any
     of the primary valid axes (like 'fsdp').
     """
-    devices = np.array(jax.devices()).reshape((4, 1, 1, 1, 1))
+    num_devices = jax.device_count()
+    devices = np.array(jax.devices()).reshape((num_devices, 1, 1, 1, 1))
     mesh = Mesh(devices, self.mesh_axes)
     pspec = PartitionSpec(("sequence", "context"), "stage", "tensor", None)
     params = {"complex_layer": jax.device_put(jnp.ones((8, 8, 2, 2)), NamedSharding(mesh, pspec))}
@@ -393,7 +399,8 @@ def test_multi_axis_mixed_sharding_fails(self):
     """
     Tests that a mix of sharded (correctly) and unsharded tensors on a complex mesh fails.
     """
-    devices = np.array(jax.devices()).reshape((4, 1, 1, 1, 1))
+    num_devices = jax.device_count()
+    devices = np.array(jax.devices()).reshape((num_devices, 1, 1, 1, 1))
     mesh = Mesh(devices, self.mesh_axes)
     sharded_pspec = PartitionSpec(("fsdp", "sequence"), "stage", ("tensor"), None)
     sharded_param = jax.device_put(jnp.ones((8, 8, 2, 2)), NamedSharding(mesh, sharded_pspec))
@@ -420,7 +427,8 @@ def setUp(self):
       self.skipTest("This test suite requires at least 4 TPU devices")
 
     self.mesh_axes = ("fsdp", "sequence", "tensor", "stage", "context")
-    devices = np.array(jax.devices()).reshape((4, 1, 1, 1, 1))
+    num_devices = jax.device_count()
+    devices = np.array(jax.devices()).reshape((num_devices, 1, 1, 1, 1))
     self.mesh = Mesh(devices, self.mesh_axes)
 
   def test_multi_axis_mixed_formating(self):
diff --git a/tests/model_test.py b/tests/model_test.py
index 99ff9e77bc..426875c849 100644
--- a/tests/model_test.py
+++ b/tests/model_test.py
@@ -30,6 +30,8 @@
 from MaxText import pyconfig
 from MaxText.common_types import DECODING_ACTIVE_SEQUENCE_INDICATOR, MODEL_MODE_TRAIN, MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 from MaxText.layers import models
 from MaxText.layers import quantizations
 
@@ -47,8 +49,10 @@ def setUp(self):
 
   def init_pyconfig(self, **kwargs):
     """Init pyconfig."""
+    # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         per_device_batch_size=1.0,
         run_name="test",
         enable_checkpointing=False,
@@ -60,6 +64,7 @@ def init_pyconfig(self, **kwargs):
         base_num_kv_heads=2,
         max_prefill_predict_length=4,
         **kwargs,
+        **extra_args,
     )
     return config
 
diff --git a/tests/moe_test.py b/tests/moe_test.py
index 848d9b5589..1f0b8fdde1 100644
--- a/tests/moe_test.py
+++ b/tests/moe_test.py
@@ -30,6 +30,8 @@
 from MaxText import pyconfig
 from MaxText.common_types import Config, DType
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 from MaxText.layers import linears
 from MaxText.layers import moe
 from MaxText.layers.initializers import NdInitializer, nd_dense_init, variable_to_logically_partitioned
@@ -41,8 +43,9 @@ class TokenDroppingTest(unittest.TestCase):
 
   def setUp(self):
     super().setUp()
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
     self.cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="token_dropping_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -52,6 +55,7 @@ def setUp(self):
         max_target_length=80,
         per_device_batch_size=1,
         capacity_factor=2,
+        **extra_args,
     )
     self.rngs = nnx.Rngs(params=0)
     devices_array = maxtext_utils.create_device_mesh(self.cfg)
@@ -166,7 +170,7 @@ class MlpBlockTest(unittest.TestCase):
   def setUp(self):
     super().setUp()
     self.config = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="mlp_block_init_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -194,6 +198,7 @@ def setUp(self):
         use_bias=True,
     )
 
+  @pytest.mark.external_serving
   def test_init(self):
     x = jnp.array([1.0, 2.0]).reshape((1, 1, 2))  # TODO(bug): need reshape due to error
     self.model.init({"params": self.rng, "dropout": self.rng}, x)
@@ -203,8 +208,10 @@ class DeepSeekRoutingTest(unittest.TestCase):
 
   def setUp(self):
     super().setUp()
+    # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
     self.cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="deepseek_routing_test",
         enable_checkpointing=False,
         decoder_block="deepseek",
@@ -217,6 +224,7 @@ def setUp(self):
         num_experts=16,
         num_experts_per_tok=4,
         sparse_matmul=True,
+        **extra_args,
     )
     self.rngs = nnx.Rngs(params=0)
     devices_array = maxtext_utils.create_device_mesh(self.cfg)
@@ -437,7 +445,7 @@ def get_moe_output(self, variables, hidden_states, cfg, mesh):
   @pytest.mark.tpu_only
   def test_megablox(self):
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="moe_block_megablox_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -465,7 +473,7 @@ def test_megablox(self):
   @pytest.mark.tpu_only
   def test_ragged_dot(self):
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="moe_block_ragged_dot_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -493,7 +501,7 @@ def test_ragged_dot(self):
   @pytest.mark.tpu_only
   def test_dense(self):
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="moe_block_dense_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -521,7 +529,7 @@ def test_dense(self):
   @pytest.mark.tpu_only
   def test_megablox_expert_parallelism(self):
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="moe_block_megablox_ep_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -551,7 +559,7 @@ def test_megablox_expert_parallelism(self):
   @pytest.mark.tpu_only
   def test_moe_fsdp_two_stage_parallelism_tpu_only(self):
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="moe_block_megablox_ep_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -583,7 +591,7 @@ def test_moe_fsdp_two_stage_parallelism_tpu_only(self):
   @pytest.mark.tpu_only
   def test_megablox_tp_transpose_parallelism(self):
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="moe_block_megablox_tp_transpose_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -596,7 +604,7 @@ def test_megablox_tp_transpose_parallelism(self):
     )
 
     cfg2 = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="moe_block_megablox_tp_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
@@ -628,7 +636,7 @@ def test_megablox_tp_transpose_parallelism(self):
   @pytest.mark.tpu_only
   def test_megablox_context_parallelism(self):
     cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="moe_block_megablox_cp_test",
         enable_checkpointing=False,
         model_name="mixtral-8x7b",
diff --git a/tests/multi_token_prediction_test.py b/tests/multi_token_prediction_test.py
index e027634156..93797367a2 100644
--- a/tests/multi_token_prediction_test.py
+++ b/tests/multi_token_prediction_test.py
@@ -26,6 +26,8 @@
 from MaxText import maxtext_utils
 from MaxText.globals import MAXTEXT_PKG_DIR
 from MaxText.layers.decoders import DecoderLayer
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 from MaxText.layers import multi_token_prediction  # The class under test
 from MaxText.layers import embeddings
 from MaxText.common_types import MODEL_MODE_TRAIN
@@ -38,11 +40,14 @@ class MultiTokenPredictionLayerTest(unittest.TestCase):
 
   def setUp(self):
     super().setUp()
+    # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
     self.cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="multi_token_prediction_layer_test",
         skip_jax_distributed_system=True,
         per_device_batch_size=8,
+        **extra_args,
     )
     self.rng = jax.random.PRNGKey(42)  # Base RNG for setup
     self.rngs = nnx.Rngs(params=self.rng, dropout=self.rng)
@@ -192,12 +197,15 @@ class MultiTokenPredictionBlockTest(unittest.TestCase):
 
   def setUp(self):
     super().setUp()
+    # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
     self.cfg = pyconfig.initialize(
-        [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [None, get_test_config_path()],
         run_name="mtp_block_test",
         skip_jax_distributed_system=True,
         mtp_num_layers=2,
         base_emb_dim=16,
+        **extra_args,
     )
     self.nnx_rngs = nnx.Rngs(params=0)
     self.rng = jax.random.PRNGKey(43)
diff --git a/tests/multihost_dataloading_test.py b/tests/multihost_dataloading_test.py
index d0c0b8d441..a7b977ffac 100644
--- a/tests/multihost_dataloading_test.py
+++ b/tests/multihost_dataloading_test.py
@@ -31,23 +31,35 @@
 from MaxText import pyconfig
 from MaxText import multihost_dataloading
 from MaxText.globals import MAXTEXT_PKG_DIR
-
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 
 class MultihostDataloadingTest(unittest.TestCase):
 
   def setUp(self):
     super().setUp()
+    decoupled = is_decoupled()
+    base_output_directory = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs")
+        if decoupled
+        else "gs://max-experiments/"
+    )
+    dataset_path = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal")
+        if decoupled
+        else "gs://maxtext-dataset/"
+    )
     batch_size = 4
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
-        per_device_batch_size=1,
-        run_name="test",
-        mesh_axes=["data"],
-        logical_axis_rules=[["batch", "data"]],
-        data_sharding=["data"],
-        base_output_directory="gs://max-experiments/",
-        dataset_path="gs://maxtext-dataset/",
-        enable_checkpointing=False,
+      [sys.argv[0], get_test_config_path()],
+      f"base_output_directory={base_output_directory}",
+      f"dataset_path={dataset_path}",
+      per_device_batch_size=1,
+      run_name="test",
+      mesh_axes=["data"],
+      logical_axis_rules=[["batch", "data"]],
+      data_sharding=["data"],
+      enable_checkpointing=False,
     )
     global_data_shape = PartitionSpec(batch_size, config.max_target_length)
     mesh_shape_1d = (len(jax.devices()),)
diff --git a/tests/offline_engine_test.py b/tests/offline_engine_test.py
index 0e599ed93a..69acc33df1 100644
--- a/tests/offline_engine_test.py
+++ b/tests/offline_engine_test.py
@@ -17,6 +17,7 @@
 import sys
 import unittest
 import os.path
+import pytest
 
 import jax
 import jax.numpy as jnp
@@ -25,6 +26,8 @@
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
 
+pytestmark = [pytest.mark.external_serving]
+from maxtext.tests.test_utils import get_test_config_path
 
 class OfflineEngineTest(unittest.TestCase):
   """Tests for JetStream Offline Engine.
@@ -59,7 +62,7 @@ def init_pyconfig(self, **kwargs):
         "skip_jax_distributed_system": True,
     } | kwargs
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **init_kwargs,
     )
     return config
diff --git a/tests/pipeline_parallelism_test.py b/tests/pipeline_parallelism_test.py
index 43efb62ca0..2941feb204 100644
--- a/tests/pipeline_parallelism_test.py
+++ b/tests/pipeline_parallelism_test.py
@@ -33,11 +33,22 @@
 from MaxText import pyconfig
 from MaxText.common_types import MODEL_MODE_TRAIN
 from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 from MaxText.layers import pipeline
 from MaxText.layers import simple_layer
 from MaxText.train import main as train_main
 from MaxText.layers import deepseek
 
+# Helper to fix pipeline parallelism in test_full_train_fp8 and test_full_train_nanoo_fp8
+def _adapt_parallelism(args, pipeline_stages=4):
+  dc = jax.device_count()
+  args.append(f"ici_pipeline_parallelism={pipeline_stages}")
+  if dc >= pipeline_stages:
+    data_par = dc // pipeline_stages
+    if data_par > 1:
+      args.append(f"ici_data_parallelism={data_par}")
+
 
 def assert_same_output_and_grad(f1, f2, *inputs):
   """check that the output and gradient are the same"""
@@ -57,7 +68,17 @@ def pytree_ravel(pytree):
 
 
 class PipelineParallelismTest(unittest.TestCase):
-
+  decoupled = is_decoupled()
+  base_output_directory = (
+      os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs")
+      if decoupled
+      else "gs://runner-maxtext-logs"
+  )
+  dataset_path = (
+      os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal")
+      if decoupled
+     else "gs://maxtext-dataset"
+  )
   def assert_pipeline_same_output_and_grad(self, config, single_pipeline_stage_class=None):
     """check that the output and gradient are the same"""
     devices_array = maxtext_utils.create_device_mesh(config)
@@ -181,7 +202,7 @@ def regular_sequential_layers_dummy_loss(
   def test_circular_minimum_microbatches_same_output_and_grad(self):
     # 4 stages, 8 layers (2 repeats, 1 layer per stage), 4 microbatches
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         enable_goodput_recording=False,
         run_name="circular_minimum_microbatches",
@@ -198,7 +219,7 @@ def test_circular_minimum_microbatches_same_output_and_grad(self):
   def test_circular_extra_microbatches_same_output_and_grad(self):
     # 4 stages, 8 layers (2 repeats, 1 layer per stage), 8 microbatches
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         enable_goodput_recording=False,
         run_name="circular_extra_microbatches",
@@ -215,7 +236,7 @@ def test_circular_extra_microbatches_same_output_and_grad(self):
   def test_circular_deepseek_megablox_same_output_and_grad(self):
     # 4 stages, 8 layers (2 repeats, 1 layer per stage), 8 microbatches
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         enable_goodput_recording=False,
         run_name="circular_moe",
@@ -238,7 +259,7 @@ def test_circular_deepseek_megablox_same_output_and_grad(self):
   def test_circular_ag_once(self):
     # 2 stages, 8 microbatches, all gather once
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         enable_goodput_recording=False,
         run_name="circular_ag_once",
@@ -256,7 +277,7 @@ def test_circular_ag_once(self):
   def test_non_circular_same_output_and_grad(self):
     # 4 stages, 4 layers (no circular repeats, 1 layer per stage), 4 microbatches
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         run_name="non_circular",
         max_target_length=128,
@@ -275,10 +296,10 @@ def test_full_train_circular(self):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-            "base_output_directory=gs://runner-maxtext-logs",
+            get_test_config_path(),
+            f"base_output_directory={self.base_output_directory}",
             "run_name=runner_pipeline_parallelism_test",
-            "dataset_path=gs://maxtext-dataset",
+            f"dataset_path={self.dataset_path}",
             "base_emb_dim=28",
             "base_num_query_heads=4",
             "base_num_kv_heads=4",
@@ -304,7 +325,7 @@ def test_full_train_circular(self):
   def test_delay_activation_forwarding_same_output_and_grad(self):
     # 4 stages, delayed activation forwarding, 8 layers (2 repeats, 1 layer per stage), 8 microbatches
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         enable_goodput_recording=False,
         run_name="activation_forwarding",
@@ -325,10 +346,10 @@ def test_full_train_non_circular(self):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-            "base_output_directory=gs://runner-maxtext-logs",
+            get_test_config_path(),
+            f"base_output_directory={self.base_output_directory}",
             "run_name=runner_pipeline_parallelism_test",
-            "dataset_path=gs://maxtext-dataset",
+            f"dataset_path={self.dataset_path}",
             "base_emb_dim=28",
             "base_num_query_heads=4",
             "base_num_kv_heads=4",
@@ -357,10 +378,10 @@ def test_subset_layers(self):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-            "base_output_directory=gs://runner-maxtext-logs",
+            get_test_config_path(),
+            f"base_output_directory={self.base_output_directory}",
             "run_name=runner_pipeline_parallelism_test",
-            "dataset_path=gs://maxtext-dataset",
+            f"dataset_path={self.dataset_path}",
             "base_emb_dim=28",
             "base_num_query_heads=4",
             "base_num_kv_heads=4",
@@ -388,65 +409,65 @@ def test_subset_layers(self):
   def test_full_train_fp8(self):
     # Run a full train.py call with fp8 quantization, which adds extra
     # variable collections that need to be handled
-    train_main(
-        [
-            None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-            "base_output_directory=gs://runner-maxtext-logs",
-            "run_name=runner_pipeline_parallelism_fp8_test",
-            "dataset_path=gs://maxtext-dataset",
-            "base_emb_dim=28",
-            "base_num_query_heads=4",
-            "base_num_kv_heads=4",
-            "base_mlp_dim=32",
-            "base_num_decoder_layers=4",
-            "head_dim=128",
-            "per_device_batch_size=2",
-            "max_target_length=1024",
-            "vocab_size=32",
-            "dataset_type=synthetic",
-            "steps=3",
-            "enable_checkpointing=False",
-            "enable_goodput_recording=False",
-            "ici_pipeline_parallelism=4",
-            rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-            "quantization=fp8",
-            "scan_layers_per_stage=False",
-            "attention=dot_product",
-        ]
-    )
+    args = [
+        None,
+        get_test_config_path(),
+        f"base_output_directory={self.base_output_directory}",
+        "run_name=runner_pipeline_parallelism_test",
+        f"dataset_path={self.dataset_path}",
+        "base_emb_dim=28",
+        "base_num_query_heads=4",
+        "base_num_kv_heads=4",
+        "base_mlp_dim=32",
+        "base_num_decoder_layers=4",
+        "head_dim=128",
+        "per_device_batch_size=2",
+        "max_target_length=1024",
+        "vocab_size=32",
+        "dataset_type=synthetic",
+        "steps=3",
+        "enable_checkpointing=False",
+        "enable_goodput_recording=False",
+        "ici_pipeline_parallelism=4",
+        rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
+        "quantization=fp8",
+        "scan_layers_per_stage=False",
+        "attention=dot_product",
+    ]
+    _adapt_parallelism(args, pipeline_stages=4)
+    train_main(args)
 
   @pytest.mark.integration_test
   def test_full_train_nanoo_fp8(self):
     # Run a full train.py call with NANOO fp8 quantization, which adds extra
     # variable collections that need to be handled
-    train_main(
-        [
-            None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
-            "base_output_directory=gs://runner-maxtext-logs",
-            "run_name=runner_pipeline_parallelism_nanoo_fp8_test",
-            "dataset_path=gs://maxtext-dataset",
-            "base_emb_dim=28",
-            "base_num_query_heads=4",
-            "base_num_kv_heads=4",
-            "base_mlp_dim=32",
-            "base_num_decoder_layers=4",
-            "head_dim=128",
-            "per_device_batch_size=2",
-            "max_target_length=1024",
-            "vocab_size=32",
-            "dataset_type=synthetic",
-            "steps=3",
-            "enable_checkpointing=False",
-            "enable_goodput_recording=False",
-            "ici_pipeline_parallelism=4",
-            rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
-            "quantization=nanoo_fp8",
-            "scan_layers_per_stage=False",
-            "attention=dot_product",
-        ]
-    )
+    args = [
+        None,
+        get_test_config_path(),
+        f"base_output_directory={self.base_output_directory}",
+        "run_name=runner_pipeline_parallelism_test",
+        f"dataset_path={self.dataset_path}",
+        "base_emb_dim=28",
+        "base_num_query_heads=4",
+        "base_num_kv_heads=4",
+        "base_mlp_dim=32",
+        "base_num_decoder_layers=4",
+        "head_dim=128",
+        "per_device_batch_size=2",
+        "max_target_length=1024",
+        "vocab_size=32",
+        "dataset_type=synthetic",
+        "steps=3",
+        "enable_checkpointing=False",
+        "enable_goodput_recording=False",
+        "ici_pipeline_parallelism=4",
+        rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
+        "quantization=nanoo_fp8",
+        "scan_layers_per_stage=False",
+        "attention=dot_product",
+    ]
+    _adapt_parallelism(args, pipeline_stages=4)
+    train_main(args)
 
 
 if __name__ == "__main__":
diff --git a/tests/profiler_test.py b/tests/profiler_test.py
index adc1a747c3..a5a34c1048 100644
--- a/tests/profiler_test.py
+++ b/tests/profiler_test.py
@@ -21,6 +21,7 @@
 from MaxText import profiler
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 
 
 class ProfilerTest(unittest.TestCase):
@@ -30,7 +31,7 @@ class ProfilerTest(unittest.TestCase):
   @pytest.mark.tpu_only
   def test_periodic_profiler_third_period_starts(self):
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         run_name="test_periodic_profiler_starts_after_regular_profile",
         profiler="xplane",
@@ -46,7 +47,7 @@ def test_periodic_profiler_third_period_starts(self):
   @pytest.mark.tpu_only
   def test_periodic_profiler_not_start_middle_period(self):
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         run_name="test_periodic_profiler_starts_after_regular_profile",
         profiler="xplane",
@@ -62,7 +63,7 @@ def test_periodic_profiler_not_start_middle_period(self):
   @pytest.mark.tpu_only
   def test_periodic_profiler_third_period_ends(self):
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         run_name="test_periodic_profiler_starts_after_regular_profile",
         profiler="xplane",
@@ -78,7 +79,7 @@ def test_periodic_profiler_third_period_ends(self):
   @pytest.mark.tpu_only
   def test_periodic_profiler_third_period_middle_not_end(self):
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         enable_checkpointing=False,
         run_name="test_periodic_profiler_starts_after_regular_profile",
         profiler="xplane",
diff --git a/tests/pyconfig_test.py b/tests/pyconfig_test.py
index 24691dfb78..2f127cdddc 100644
--- a/tests/pyconfig_test.py
+++ b/tests/pyconfig_test.py
@@ -19,6 +19,7 @@
 
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.pyconfig import resolve_config_path
 
 
@@ -27,7 +28,7 @@ class PyconfigTest(unittest.TestCase):
 
   def test_empty_string_parse_as_empty_string(self):
     config = pyconfig.initialize(
-        [os.path.join(MAXTEXT_PKG_DIR, "train.py"), os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [os.path.join(MAXTEXT_PKG_DIR, "train.py"), get_test_config_path()],
         skip_jax_distributed_system=True,  # We should check for this automatically instead - b/407047411
         quantization="",
     )
@@ -36,7 +37,7 @@ def test_empty_string_parse_as_empty_string(self):
 
   def test_multiple_unmodifiable_configs(self):
     config_train = pyconfig.initialize(
-        [os.path.join(MAXTEXT_PKG_DIR, "train.py"), os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [os.path.join(MAXTEXT_PKG_DIR, "train.py"), get_test_config_path()],
         per_device_batch_size=1.0,
         run_name="test",
         enable_checkpointing=False,
@@ -51,7 +52,7 @@ def test_multiple_unmodifiable_configs(self):
         ici_fsdp_parallelism=4,
     )
     config_inference = pyconfig.initialize(
-        [os.path.join(MAXTEXT_PKG_DIR, "decode.py"), os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [os.path.join(MAXTEXT_PKG_DIR, "decode.py"), get_test_config_path()],
         per_device_batch_size=1.0,
         run_name="test",
         enable_checkpointing=False,
@@ -74,7 +75,7 @@ def test_multiple_unmodifiable_configs(self):
 
   def test_overriding_model(self):
     config = pyconfig.initialize(
-        [os.path.join(MAXTEXT_PKG_DIR, "train.py"), os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [os.path.join(MAXTEXT_PKG_DIR, "train.py"), get_test_config_path()],
         skip_jax_distributed_system=True,
         model_name="gemma-7b",
         override_model_config=True,
diff --git a/tests/quantizations_test.py b/tests/quantizations_test.py
index 798fbaadd7..a87c85aded 100644
--- a/tests/quantizations_test.py
+++ b/tests/quantizations_test.py
@@ -33,6 +33,8 @@
 from aqt.jax.v2.flax import aqt_flax
 
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 from MaxText import pyconfig
 from MaxText.layers import nnx_wrappers, quantizations
 from MaxText import maxtext_utils
@@ -42,6 +44,7 @@
 
 _QUERY_REGEX = ".*/query"
 _VALUE_REGEX = ".*/value"
+MAXTEXT_PKG_DIR = os.path.join("src", MAXTEXT_PKG_DIR)
 
 
 class QuantTestModule(nnx.Module):
@@ -105,7 +108,7 @@ def __call__(self, inputs):
 
 def _configure_quantization(quant_str="", quant_cfg_path="", mode_str="train", replicate_scale=False):
   config = pyconfig.initialize(
-      [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+      [None, get_test_config_path()],
       enable_checkpointing=False,
       quantization=quant_str,
       quant_cfg_path=quant_cfg_path,
@@ -298,6 +301,8 @@ def setUp(self):
 
   def init_pyconfig(self, **kwargs):
     """Initialize MaxText pyconfig."""
+    # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
+    extra_args = {"ici_fsdp_parallelism": jax.device_count()} if is_decoupled() else {}
     init_kwargs = {
         "run_name": "test",
         "dataset_type": "synthetic",
@@ -312,9 +317,9 @@ def init_pyconfig(self, **kwargs):
         "base_num_kv_heads": 8,
         "base_mlp_dim": 4096,
         "base_num_decoder_layers": 12,
-    } | kwargs
+    } | kwargs | extra_args
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        [sys.argv[0], get_test_config_path()],
         **init_kwargs,
     )
     return config
@@ -419,10 +424,12 @@ def test_fp8_full_quantization(self):
     self.quantization_config("fp8_full")
 
   @pytest.mark.gpu_only
+  @pytest.mark.external_serving
   def test_fp8_gpu_quantization(self):
     self.quantization_config("fp8_gpu", grad_tolerance=1.0)
 
   @pytest.mark.gpu_only
+  @pytest.mark.external_serving
   def test_fp8_nanoo_quantization(self):
     self.quantization_config("fp8_nanoo", grad_tolerance=1.0)
 
diff --git a/tests/run_sharding_dump.py b/tests/run_sharding_dump.py
index 5d6067e063..4de0760b2d 100644
--- a/tests/run_sharding_dump.py
+++ b/tests/run_sharding_dump.py
@@ -18,6 +18,7 @@
 from typing import Sequence
 
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from tests.sharding_dump import TEST_CASES
 import os
 import subprocess
@@ -31,7 +32,7 @@ def run_single_dump(model_name: str, topology: str, num_slice: str) -> None:
           "python3",
           "-m",
           "tests.sharding_dump",
-          os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+          get_test_config_path(),
           f"compile_topology={topology}",
           f"compile_topology_num_slices={num_slice}",
           f"model_name={model_name}",
diff --git a/tests/sft_data_processing_test.py b/tests/sft_data_processing_test.py
index 84b28f459f..313380e335 100644
--- a/tests/sft_data_processing_test.py
+++ b/tests/sft_data_processing_test.py
@@ -17,6 +17,7 @@
 import subprocess
 import unittest
 import os.path
+import pytest
 
 import numpy as np
 
@@ -87,7 +88,7 @@
     ],
 ]
 
-
+@pytest.mark.external_training # Uses gsutil to pull tokenizer.
 class SFTDataProcessingTest(unittest.TestCase):
 
   @classmethod
diff --git a/tests/sft_hooks_test.py b/tests/sft_hooks_test.py
index d4ecd351e9..029698bf9b 100644
--- a/tests/sft_hooks_test.py
+++ b/tests/sft_hooks_test.py
@@ -22,12 +22,14 @@
 import numpy as np
 import os
 import unittest
+import pytest
 from unittest.mock import MagicMock, patch
 from jax.sharding import Mesh
 
 from MaxText import maxtext_utils
 from MaxText import pyconfig
 from MaxText.maxtext_utils import create_device_mesh
+from MaxText import gcloud_stub
 from MaxText.globals import MAXTEXT_PKG_DIR
 from MaxText.sft import hooks
 
@@ -43,7 +45,13 @@ def setUp(self):
         base_output_directory="test",
         skip_jax_distributed_system=True,
     )
-    self.mesh = Mesh(create_device_mesh(self.config), self.config.mesh_axes)
+    # Use a synthetic dataset for unit tests only when running in decoupled mode so
+    # tests remain self-contained and don't attempt remote access.
+    if gcloud_stub.is_decoupled():
+      self.config.dataset_type = "synthetic"
+
+    mesh_shape_1d = (len(jax.devices()),)
+    self.mesh = Mesh(mesh_utils.create_device_mesh(mesh_shape_1d), self.config.mesh_axes)
     learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(self.config)
 
     self.training_hooks = hooks.SFTTrainingHooks(self.config, self.mesh, learning_rate_schedule, goodput_recorder=None)
diff --git a/tests/sharding_compare_test.py b/tests/sharding_compare_test.py
index 9e7d198553..4274423e54 100644
--- a/tests/sharding_compare_test.py
+++ b/tests/sharding_compare_test.py
@@ -20,6 +20,7 @@
 import pytest
 
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.train_compile import get_shaped_inputs, get_topology_mesh, validate_config
 from MaxText import pyconfig
 
@@ -83,7 +84,7 @@ def test_sharding_dump_for_model(model_name: str, topology: str, num_slice: str)
   """Test if the sharding of new model implementation is as expected."""
   params = [
       "/deps/MaxText/tests/sharding_compare_test",
-      os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+      get_test_config_path(),
       f"compile_topology={topology}",
       f"compile_topology_num_slices={num_slice}",
       f"model_name={model_name}",
diff --git a/tests/simple_decoder_layer_test.py b/tests/simple_decoder_layer_test.py
index ceefca39e3..d1946a70ba 100644
--- a/tests/simple_decoder_layer_test.py
+++ b/tests/simple_decoder_layer_test.py
@@ -20,6 +20,7 @@
 
 from MaxText.train import main as train_main
 from MaxText.globals import MAXTEXT_PKG_DIR, MAXTEXT_ASSETS_ROOT
+from maxtext.tests.test_utils import get_test_config_path
 
 
 class SimpleDecoderLayerTest(unittest.TestCase):
@@ -29,7 +30,7 @@ def test_simple_decoder_layer(self):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             "base_output_directory=gs://runner-maxtext-logs",
             "run_name=runner_simple_decoder_layer_test",
             "dataset_path=gs://maxtext-dataset",
@@ -46,7 +47,7 @@ def test_mlp_decoder_layer(self):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             "base_output_directory=gs://runner-maxtext-logs",
             "run_name=runner_simple_decoder_layer_test",
             "dataset_path=gs://maxtext-dataset",
diff --git a/tests/state_dtypes_test.py b/tests/state_dtypes_test.py
index 9d0174d7ff..1053b05e63 100644
--- a/tests/state_dtypes_test.py
+++ b/tests/state_dtypes_test.py
@@ -27,6 +27,8 @@
 from MaxText.layers import quantizations
 from MaxText import maxtext_utils
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
+from MaxText.gcloud_stub import is_decoupled
 
 Transformer = models.transformer_as_linen
 
@@ -36,7 +38,10 @@ class StateDtypes(unittest.TestCase):
 
   def get_state(self, argv):
     """Gets model state including weights and optimizer state"""
-
+    # Conditionally set ici_fsdp_parallelism to match device count in decoupled mode
+    if is_decoupled():
+        argv = list(argv) + [f"ici_fsdp_parallelism={jax.device_count()}"]
+    
     # Setup necessary inputs to build a model state
     config = pyconfig.initialize(argv)
     quant = quantizations.configure_quantization(config)
@@ -60,14 +65,14 @@ def assert_pytree_is_dtype(self, weights, expected_dtype):
     jax.tree_util.tree_map_with_path(lambda x, y: self.assertEqual(y.dtype, expected_dtype), weights)
 
   def test_default_float32(self):
-    argv = [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"), "enable_checkpointing=False"]
+    argv = [None, get_test_config_path(), "enable_checkpointing=False"]
     weights = self.get_weights(argv)
     self.assert_pytree_is_dtype(weights, jnp.float32)
 
   def test_set_bf16(self):
     argv = [
         None,
-        os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+        get_test_config_path(),
         "enable_checkpointing=False",
         "weight_dtype=bfloat16",
     ]
@@ -75,11 +80,11 @@ def test_set_bf16(self):
     self.assert_pytree_is_dtype(weights, jnp.bfloat16)
 
   def test_default_mu_float32(self):
-    argv = [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"), "enable_checkpointing=False"]
+    argv = [None, get_test_config_path(), "enable_checkpointing=False"]
     mu = self.get_mu(argv)
     self.assert_pytree_is_dtype(mu, jnp.float32)
 
   def test_set_mu_bf16(self):
-    argv = [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"), "enable_checkpointing=False", "mu_dtype=bfloat16"]
+    argv = [None, get_test_config_path(), "enable_checkpointing=False", "mu_dtype=bfloat16"]
     mu = self.get_mu(argv)
     self.assert_pytree_is_dtype(mu, jnp.bfloat16)
diff --git a/tests/test_env_smoke.py b/tests/test_env_smoke.py
new file mode 100644
index 0000000000..7cc1cd8dd7
--- /dev/null
+++ b/tests/test_env_smoke.py
@@ -0,0 +1,69 @@
+"""Pytest-based environment smoke test for MaxText (used esp for decoupling testing).
+
+Checks:
+  - Core imports (jax, flax, numpy)
+  - Optional imports
+  - JAX device enumeration
+
+Fails only on missing core imports or device query failure; alias test asserts mapping rules.
+"""
+from __future__ import annotations
+import os, time, importlib
+from MaxText.gcloud_stub import is_decoupled
+
+CORE_IMPORTS = ["jax", "jax.numpy", "flax", "numpy"]
+OPTIONAL_IMPORTS = ["transformers", "MaxText", "MaxText.pyconfig", "MaxText.maxengine"]
+
+_defects: list[str] = []
+
+
+def _import(name: str):
+    t0 = time.time()
+    try:
+        mod = importlib.import_module(name)
+        return name, mod, time.time() - t0, None
+    except Exception as e:  # pragma: no cover
+        return name, None, time.time() - t0, e
+
+
+def test_environment_core_imports():
+    results = [_import(n) for n in CORE_IMPORTS]
+    missing = [n for (n, m, _, err) in results if m is None]
+    if missing:
+        raise AssertionError(f"Missing core imports: {missing}")
+
+
+def test_environment_optional_imports():
+    results = [_import(n) for n in OPTIONAL_IMPORTS]
+    for (n, m, dt, err) in results:
+        if err is not None:
+            _defects.append(f"{n} FAIL: {err}")
+        else:
+            if dt > 8.0:
+                _defects.append(f"{n} SLOW_IMPORT ({dt:.1f}s)")
+
+
+def test_jax_devices():
+    try:
+        import jax  # type: ignore
+    except Exception as e:  # pragma: no cover
+        raise AssertionError(f"jax not importable for device test: {e}")
+    try:
+        devices = jax.devices()
+    except Exception as e:  # pragma: no cover
+        raise AssertionError(f"jax.devices() failed: {e}")
+    assert len(devices) >= 1, "No JAX devices found"
+
+
+def test_decoupled_flag_consistency():
+    decoupled = is_decoupled()
+    # Soft check only; logic exercised in other tests.
+    if decoupled:
+        pass
+    else:
+        pass
+
+
+def test_report_defects():
+    if _defects:
+        print("Environment optional issues:\n" + "\n".join(_defects))
diff --git a/tests/tfds_data_processing_test.py b/tests/tfds_data_processing_test.py
index f3f515e567..71914bf368 100644
--- a/tests/tfds_data_processing_test.py
+++ b/tests/tfds_data_processing_test.py
@@ -21,31 +21,50 @@
 from jax.sharding import Mesh
 from jax.experimental import mesh_utils
 
+from MaxText.gcloud_stub import is_decoupled
 import tensorflow as tf
 import tensorflow_datasets as tfds
 
 from MaxText import pyconfig
 from MaxText.globals import MAXTEXT_ASSETS_ROOT, MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 from MaxText.input_pipeline import _tfds_data_processing
 from MaxText.input_pipeline import input_pipeline_interface
 
+MAXTEXT_ASSETS_ROOT = os.path.join("src", MAXTEXT_PKG_DIR, "assets")
+
 
 class TfdsDataProcessingTest(unittest.TestCase):
 
   def setUp(self):
     super().setUp()
+    decoupled = is_decoupled()
+    if decoupled:
+      local_dataset_name = "c4/en:3.1.0"
+      _dataset_path = os.path.join("local_datasets", "c4_en_dataset_minimal")
+      _base_output_directory = os.path.join("local_datasets", "gcloud_decoupled_test_logs")
+    else:
+      local_dataset_name = None
+      _dataset_path = "gs://maxtext-dataset"
+      _base_output_directory = "gs://max-experiments/"
+    config_kwargs = dict(
+      per_device_batch_size=1,
+      run_name="test",
+      mesh_axes=["data"],
+      logical_axis_rules=[["batch", "data"]],
+      data_sharding=["data"],
+      base_output_directory=_base_output_directory,
+      dataset_path=_dataset_path,
+      tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer"),
+      enable_checkpointing=False,
+      eval_interval=10,
+    )
+
+    if decoupled and local_dataset_name:
+      config_kwargs["dataset_name"] = local_dataset_name
     config = pyconfig.initialize(
-        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
-        per_device_batch_size=1,
-        run_name="test",
-        mesh_axes=["data"],
-        logical_axis_rules=[["batch", "data"]],
-        data_sharding=["data"],
-        base_output_directory="gs://max-experiments/",
-        dataset_path="gs://maxtext-dataset/",
-        tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer"),
-        enable_checkpointing=False,
-        eval_interval=10,
+      [sys.argv[0], get_test_config_path()],
+      **config_kwargs
     )
     os.environ["TFDS_DATA_DIR"] = config.dataset_path
     self.config = config
diff --git a/tests/tiling_test.py b/tests/tiling_test.py
index 9d462509cf..aac9e7804a 100644
--- a/tests/tiling_test.py
+++ b/tests/tiling_test.py
@@ -24,6 +24,7 @@
 import jax
 import jax.numpy as jnp
 from jax.sharding import Mesh
+from maxtext.tests.test_utils import get_test_config_path
 from flax import linen as nn
 
 from MaxText import maxtext_utils
@@ -66,7 +67,7 @@ def setUp(self):
     """
     Set up common configurations and dummy data for the tests.
     """
-    self.base_config = [None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")]
+    self.base_config = [None, get_test_config_path()]
     self.rng = jax.random.PRNGKey(1234)
     self.batch_size = 1
     self.seq_len = 64
diff --git a/tests/train_compile_test.py b/tests/train_compile_test.py
index 8f5ddcda85..61faf61bd4 100644
--- a/tests/train_compile_test.py
+++ b/tests/train_compile_test.py
@@ -27,7 +27,9 @@
 
 from MaxText.train_compile import main as train_compile_main
 from MaxText.globals import MAXTEXT_PKG_DIR
+from maxtext.tests.test_utils import get_test_config_path
 
+pytestmark = [pytest.mark.tpu_only]
 
 class TrainCompile(unittest.TestCase):
   """Tests for the Ahead of Time Compilation functionality, train_compile.py"""
@@ -39,7 +41,7 @@ def test_save_compiled_v4(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v4-8",
             "compile_topology_num_slices=1",
@@ -56,7 +58,7 @@ def test_save_compiled_v5e(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5e-16",
             "compile_topology_num_slices=1",
@@ -75,7 +77,7 @@ def test_minimal_offloaded_v5e(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5e-256",
             "compile_topology_num_slices=1",
@@ -97,7 +99,7 @@ def test_save_flash(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-256",
             "compile_topology_num_slices=1",
@@ -114,7 +116,7 @@ def test_save_compiled_v5p_two_slices(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-8",
             "compile_topology_num_slices=2",
@@ -133,7 +135,7 @@ def test_save_compiled_v6e(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-16",
             "compile_topology_num_slices=1",
@@ -150,7 +152,7 @@ def test_sequence_parallelism(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5e-256",
             "use_iota_embed=true",
@@ -169,7 +171,7 @@ def test_remat_save_dot_except_mlpwi(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5e-256",
             "compile_topology_num_slices=1",
@@ -192,7 +194,7 @@ def test_remat_save_dot_except_mlp(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5e-256",
             "compile_topology_num_slices=1",
@@ -215,7 +217,7 @@ def test_remat_save_qkv_proj(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5e-256",
             "compile_topology_num_slices=1",
@@ -238,7 +240,7 @@ def test_remat_full(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5e-256",
             "compile_topology_num_slices=1",
@@ -261,7 +263,7 @@ def test_custom_64x4_mesh(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "use_iota_embed=true",
@@ -284,7 +286,7 @@ def test_llama3_1_70b_opt_offload(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "compile_topology_num_slices=1",
@@ -303,7 +305,7 @@ def test_custom_32x8_mesh(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "use_iota_embed=true",
@@ -328,7 +330,7 @@ def test_moe_dropping_bf16(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "use_iota_embed=true",
@@ -351,7 +353,7 @@ def test_moe_dropping_int8(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-128",
             "use_iota_embed=true",
@@ -375,7 +377,7 @@ def test_moe_megablox_bf16(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "use_iota_embed=true",
@@ -397,7 +399,7 @@ def test_moe_ragged_dot_bf16(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "use_iota_embed=true",
@@ -419,7 +421,7 @@ def test_moe_dense_bf16(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "use_iota_embed=true",
@@ -442,7 +444,7 @@ def test_moe_dense_int8(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-128",
             "use_iota_embed=true",
@@ -465,7 +467,7 @@ def test_moe_pp_bf16(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "use_iota_embed=true",
@@ -489,7 +491,7 @@ def test_moe_deepseek_scanned_bf16(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-256",
             "use_iota_embed=true",
@@ -514,7 +516,7 @@ def test_moe_deepseek_unscanned_bf16(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-256",
             "use_iota_embed=true",
@@ -537,7 +539,7 @@ def test_moe_deepseek_with_device_limit(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-256",
             "use_iota_embed=true",
@@ -561,7 +563,7 @@ def test_moe_deepseek_without_device_limit(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-256",
             "use_iota_embed=true",
@@ -585,7 +587,7 @@ def test_moe_deepseek_pipeline_subset(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "compile_topology_num_slices=8",
@@ -608,7 +610,7 @@ def test_pipeline_subset(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v6e-256",
             "compile_topology_num_slices=8",
@@ -628,7 +630,7 @@ def test_moe_llama4_17b_16e(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-256",
             "compile_topology_num_slices=1",
@@ -649,7 +651,7 @@ def test_moe_gpt_oss_20b_sparse_matmul(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-64",
             "compile_topology_num_slices=1",
@@ -671,7 +673,7 @@ def test_moe_gpt_oss_20b_dense_matmul(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-64",
             "compile_topology_num_slices=1",
@@ -693,7 +695,7 @@ def test_gpt3_6b(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-256",
             "compile_topology_num_slices=1",
@@ -709,7 +711,7 @@ def test_qwen3_qk_norm(self):
     train_compile_main(
         (
             "",
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"compiled_trainstep_file={compiled_trainstep_file}",
             "compile_topology=v5p-8",
             "compile_topology_num_slices=1",
diff --git a/tests/train_gpu_smoke_test.py b/tests/train_gpu_smoke_test.py
index 80d1710770..538346a55f 100644
--- a/tests/train_gpu_smoke_test.py
+++ b/tests/train_gpu_smoke_test.py
@@ -15,7 +15,7 @@
 """ Smoke test """
 import os
 import unittest
-
+from MaxText.gcloud_stub import is_decoupled
 from absl.testing import absltest
 
 from MaxText.train import main as train_main
@@ -27,14 +27,27 @@ class Train(unittest.TestCase):
 
   def test_tiny_config(self):
     test_tmpdir = os.environ.get("TEST_TMPDIR")  # pylint: disable=unused-variable
+    decoupled = is_decoupled()
+    # Use local minimal dataset if decoupled, otherwise default gs:// path.
+    dataset_path = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal") if decoupled else "gs://maxtext-dataset"
+    )
+    base_output_directory = (
+        os.environ.get(
+            "LOCAL_BASE_OUTPUT",
+            os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs"),
+        )
+        if decoupled
+        else "gs://runner-maxtext-logs"
+    )
     train_main(
         [
             None,
             os.path.join(MAXTEXT_PKG_DIR, "configs", "gpu_smoke_test.yml"),
             # pylint: disable=f-string-without-interpolation
-            f"base_output_directory=gs://runner-maxtext-logs",
+            f"base_output_directory={base_output_directory}",
             "run_name=runner_test",
-            r"dataset_path=gs://maxtext-dataset",
+            f"dataset_path={dataset_path}",
             "enable_checkpointing=False",
             rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizer.llama2')}",
             "enable_goodput_recording=False",
diff --git a/tests/train_int8_smoke_test.py b/tests/train_int8_smoke_test.py
index dedf9d27c0..99a33a6c40 100644
--- a/tests/train_int8_smoke_test.py
+++ b/tests/train_int8_smoke_test.py
@@ -14,6 +14,7 @@
 
 """Smoke test for int8"""
 import os
+from MaxText.gcloud_stub import is_decoupled
 import unittest
 
 from absl.testing import absltest
@@ -27,14 +28,26 @@ class Train(unittest.TestCase):
 
   def test_tiny_config(self):
     test_tmpdir = os.environ.get("TEST_TMPDIR")  # pylint: disable=unused-variable
+    decoupled = is_decoupled()
+    dataset_path = (
+        os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "c4_en_dataset_minimal") if decoupled else "gs://maxtext-dataset"
+    )
+    base_output_directory = (
+        os.environ.get(
+            "LOCAL_BASE_OUTPUT",
+            os.path.join(MAXTEXT_PKG_DIR, "..", "local_datasets", "gcloud_decoupled_test_logs"),
+        )
+        if decoupled
+        else "gs://runner-maxtext-logs"
+    )
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             # pylint: disable=f-string-without-interpolation
-            f"base_output_directory=gs://runner-maxtext-logs",
+            f"base_output_directory={base_output_directory}",
             "run_name=runner_test",
-            r"dataset_path=gs://maxtext-dataset",
+            f"dataset_path={dataset_path}",
             "base_emb_dim=8",
             "base_num_query_heads=4",
             "base_num_kv_heads=4",
diff --git a/tests/train_smoke_test.py b/tests/train_smoke_test.py
index b839232e60..c39d209432 100644
--- a/tests/train_smoke_test.py
+++ b/tests/train_smoke_test.py
@@ -30,7 +30,7 @@ def test_tiny_config(self):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             # pylint: disable=f-string-without-interpolation
             f"base_output_directory=gs://runner-maxtext-logs",
             "run_name=runner_test",
@@ -87,7 +87,7 @@ def test_tiny_config_explicit_shardmode(self):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             # pylint: disable=f-string-without-interpolation
             f"base_output_directory=gs://runner-maxtext-logs",
             "run_name=runner_test",
diff --git a/tests/train_using_ragged_dot_smoke_test.py b/tests/train_using_ragged_dot_smoke_test.py
index 2d368cf0d9..1213a1f3e6 100644
--- a/tests/train_using_ragged_dot_smoke_test.py
+++ b/tests/train_using_ragged_dot_smoke_test.py
@@ -40,7 +40,7 @@ def test_tiny_config(self, quantization: str):
     train_main(
         [
             None,
-            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            get_test_config_path(),
             f"base_output_directory={test_tmpdir}",
             "run_name=ragged_dot_smoke_test",
             "base_emb_dim=128",