From 22ed086416068bcc28556f2bba4ef2934d6344b3 Mon Sep 17 00:00:00 2001
From: CharlesCNorton
Date: Tue, 13 Jan 2026 05:46:04 -0500
Subject: [PATCH] Fix Windows safetensors load order to prevent crash

On Windows, loading a large safetensors file (a 41GB BF16 model)
crashes with a segmentation fault when Gemma is already loaded in
memory. The crash stems from a safetensors memory-mapping issue that
only surfaces when Gemma is resident, so the state dict must be
loaded first.

Changes:
- Reorder SingleGPUModelBuilder.build() to load the state dict BEFORE
  applying module_ops (which load Gemma)
- Make the triton import conditional, since triton ships Linux-only
  wheels, and fall back to a plain PyTorch add for FP8 weight fusion
  on systems without triton
---
 packages/ltx-core/src/ltx_core/loader/fuse_loras.py | 10 +++++++---
 .../src/ltx_core/loader/single_gpu_model_builder.py |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/packages/ltx-core/src/ltx_core/loader/fuse_loras.py b/packages/ltx-core/src/ltx_core/loader/fuse_loras.py
index 66269dd..4a80db3 100644
--- a/packages/ltx-core/src/ltx_core/loader/fuse_loras.py
+++ b/packages/ltx-core/src/ltx_core/loader/fuse_loras.py
@@ -1,7 +1,11 @@
 import torch
-import triton
-from ltx_core.loader.kernels import fused_add_round_kernel
+try:
+    import triton
+    from ltx_core.loader.kernels import fused_add_round_kernel
+    TRITON_AVAILABLE = True
+except ImportError:
+    TRITON_AVAILABLE = False
 from ltx_core.loader.primitives import LoraStateDictWithStrength, StateDict
 
 
 BLOCK_SIZE = 1024
@@ -84,7 +88,7 @@ def apply_loras(
                 continue
             deltas = weight.clone().to(dtype=target_dtype, device=device)
         elif weight.dtype == torch.float8_e4m3fn:
-            if str(device).startswith("cuda"):
+            if TRITON_AVAILABLE and str(device).startswith("cuda"):
                 deltas = calculate_weight_float8_(deltas, weight)
             else:
                 deltas.add_(weight.to(dtype=deltas.dtype, device=device))
diff --git a/packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py b/packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py
index 9e8853a..a6394f2 100644
--- a/packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py
+++ b/packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py
@@ -72,9 +72,9 @@ def _return_model(self, meta_model: ModelType, device: torch.device) -> ModelTyp
     def build(self, device: torch.device | None = None, dtype: torch.dtype | None = None) -> ModelType:
         device = torch.device("cuda") if device is None else device
         config = self.model_config()
-        meta_model = self.meta_model(config, self.module_ops)
         model_paths = self.model_path if isinstance(self.model_path, tuple) else [self.model_path]
         model_state_dict = self.load_sd(model_paths, sd_ops=self.model_sd_ops, registry=self.registry, device=device)
+        meta_model = self.meta_model(config, self.module_ops)
 
         lora_strengths = [lora.strength for lora in self.loras]
         if not lora_strengths or (min(lora_strengths) == 0 and max(lora_strengths) == 0):
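
Note for reviewers: the guard added to fuse_loras.py is the standard
optional-dependency idiom. A minimal standalone sketch of the same
pattern follows; "add_weight_delta" is a hypothetical name invented for
this note, and the triton fast path is described only in comments,
since the real kernel (fused_add_round_kernel, wrapped by
calculate_weight_float8_) lives in this repo:

    import torch

    try:
        import triton  # publishes Linux wheels only; ImportError on Windows
        TRITON_AVAILABLE = True
    except ImportError:
        TRITON_AVAILABLE = False

    def add_weight_delta(deltas: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
        # Portable fallback, taken when triton is missing or the device is
        # not CUDA: upcast the FP8 weight to the accumulator dtype, then
        # add in place. With triton on CUDA, apply_loras() instead calls
        # calculate_weight_float8_(), which runs the fused Triton kernel.
        return deltas.add_(weight.to(dtype=deltas.dtype, device=deltas.device))

    # Example: fuse an FP8 delta into a BF16 accumulator on any platform.
    acc = torch.zeros(4, 4, dtype=torch.bfloat16)
    w = torch.randn(4, 4).to(torch.float8_e4m3fn)
    add_weight_delta(acc, w)

Checking TRITON_AVAILABLE at each call site (rather than letting the
import crash at module load) keeps the CUDA+Linux fast path intact
while letting Windows and CPU users run the same code path end to end.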