
Commit 9ceae8a

llama context parallelism
1 parent 547476b

2 files changed: +18 −8 lines

maester/models/llama/model.py: 4 additions & 1 deletion

@@ -20,6 +20,8 @@
 from maester.models.norms import create_norm
 from maester.models.llama.tied_linear import TiedLinear
 
+from torch.distributed.device_mesh import DeviceMesh
+
 
 @dataclass
 class ModelArgs:

@@ -457,12 +459,13 @@ def forward(
         return output
 
     @classmethod
-    def from_model_args(cls, model_args: ModelArgs) -> "Transformer":
+    def from_model_args(cls, model_args: ModelArgs, cp_device_mesh: Optional[DeviceMesh] = None) -> "Transformer":
         """
         Initialize a Transformer model from a ModelArgs object.
 
         Args:
            model_args (ModelArgs): Model configuration arguments.
+            cp_device_mesh (Optional[DeviceMesh]): Device mesh for context parallelism.
 
         Returns:
            Transformer: Transformer model.
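
The new `cp_device_mesh` argument only changes the constructor signature in this hunk; the commit does not show how the mesh is built or how the model consumes it. A minimal sketch of a plausible call site, assuming the caller passes the "cp" slice of a larger world mesh (the mesh sizes, dimension names, and default `ModelArgs()` below are illustrative assumptions, not code from this commit):

```python
# Illustrative only: build a mesh with a context-parallel dimension and hand
# its "cp" slice to the model constructor. Assumes torch.distributed is
# already initialized (e.g. launched with torchrun).
from torch.distributed.device_mesh import init_device_mesh

from maester.models.llama.model import ModelArgs, Transformer  # module path assumed

# Example layout: 8 GPUs split into 4-way data parallel x 2-way context parallel.
world_mesh = init_device_mesh("cuda", (4, 2), mesh_dim_names=("dp", "cp"))

model_args = ModelArgs()  # defaults; real runs take this from the maester config
model = Transformer.from_model_args(model_args, cp_device_mesh=world_mesh["cp"])
```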

maester/parallelisms/parallelize_llama.py: 14 additions & 7 deletions

@@ -60,14 +60,21 @@ def parallelize_llama(
                 "fused_rmsnorm is not compatible with torch.compile yet. "
                 "Please use rmsnorm or layernorm."
             )
-        apply_compile(model)
+        apply_compile(model, fullgraph=not parallel_dims.cp_enabled)
 
-    if parallel_dims.dp_shard_enabled:
+    if parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled:
         if parallel_dims.dp_replicate_enabled:
-            dp_mesh = world_mesh["dp_replicate", "dp_shard"]
+            if parallel_dims.cp_enabled:
+                dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
+            else:
+                dp_mesh_dim_names = ("dp_replicate", "dp_shard")
         else:
-            dp_mesh = world_mesh["dp"]
+            if parallel_dims.cp_enabled:
+                dp_mesh_dim_names = ("dp_shard_cp",)
+            else:
+                dp_mesh_dim_names = ("dp",)
 
+        dp_mesh = world_mesh[tuple(dp_mesh_dim_names)]
         apply_fsdp(model, dp_mesh, param_dtype=TORCH_DTYPE_MAP[config.mixed_precision_param],
                    reduce_dtype=TORCH_DTYPE_MAP[config.mixed_precision_reduce])
         if parallel_dims.dp_replicate_enabled:
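
When CP is enabled, the hunk above indexes `world_mesh` by a flattened "dp_shard_cp" dimension so that FSDP shards parameters over both the dp_shard and cp ranks. The mesh construction itself is not part of this commit; a minimal sketch of one way such a mesh could be produced, using the private `DeviceMesh._flatten` helper in the style of torchtitan (the dimension sizes and the use of that helper are assumptions, not code from this repo):

```python
# Sketch only: a world mesh whose "dp_shard" and "cp" dims are flattened into
# "dp_shard_cp" for FSDP sharding, while "cp" stays available for the
# context-parallel attention path.
from torch.distributed.device_mesh import init_device_mesh

# Example: 16 GPUs = 2 replicate groups x 4-way shard x 2-way context parallel.
world_mesh = init_device_mesh(
    "cuda", (2, 4, 2), mesh_dim_names=("dp_replicate", "dp_shard", "cp")
)

# Private API in recent PyTorch (as used by torchtitan); assumed here, not
# shown in this commit.
world_mesh["dp_shard", "cp"]._flatten(mesh_dim_name="dp_shard_cp")

# Mirrors the selection logic in the hunk above when both flags are enabled.
dp_mesh = world_mesh[("dp_replicate", "dp_shard_cp")]
```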
@@ -239,16 +246,16 @@ def apply_ac(model: nn.Module, ac_config: Config):
     logger.info(f"Applied {ac_config.ac_mode} activation checkpointing to the model")
 
 
-def apply_compile(model: nn.Module):
+def apply_compile(model: nn.Module, fullgraph: bool = True):
     """
     Apply torch.compile to each TransformerBlock, which makes compilation efficient due to
     repeated structure. Alternatively one can compile the whole model (after applying DP).
     """
     for layer_id, transformer_block in model.layers.named_children():
-        transformer_block = torch.compile(transformer_block, fullgraph=True)
+        transformer_block = torch.compile(transformer_block, fullgraph=fullgraph)
         model.layers.register_module(layer_id, transformer_block)
 
-    logger.info("Compiling each TransformerBlock with torch.compile")
+    logger.info(f"Compiling each TransformerBlock with torch.compile (fullgraph={fullgraph})")
 
 
 def apply_fsdp(
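
The commit relaxes `fullgraph` when CP is enabled, presumably because the context-parallel attention path introduces graph breaks that `fullgraph=True` would turn into hard compile errors; the diff itself does not state the reason. It also does not show the training-loop side of context parallelism. For reference, a hedged sketch of how PyTorch's experimental `context_parallel` context manager is typically applied over sequence-dimension buffers, continuing from the `world_mesh` sketch above (the buffer names and sequence dims are assumptions, not code from this repo):

```python
# Sketch only: shard sequence-dim tensors over the "cp" mesh for one training
# step. `inputs`, `labels`, `model`, and `loss_fn` stand in for the batch, the
# constructed Transformer, and the loss; they are illustrative names.
from torch.distributed.tensor.experimental import context_parallel

cp_mesh = world_mesh["cp"]

with context_parallel(
    cp_mesh,
    buffers=[inputs, labels, model.freqs_cis],  # tensors sharded along their sequence dim
    buffer_seq_dims=[1, 1, 0],                  # which dim of each buffer is the sequence dim
    no_restore_buffers={inputs, labels},        # consumed within the step; skip restoring them
):
    loss = loss_fn(model(inputs), labels)
    loss.backward()
```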
