15 changes: 15 additions & 0 deletions configs/flux2/flux2_dev_offload.json
@@ -0,0 +1,15 @@
+{
+    "model_cls": "flux2_dev",
+    "task": "t2i",
+    "infer_steps": 50,
+    "sample_guide_scale": 4.0,
+    "vae_scale_factor": 16,
+    "feature_caching": "None",
+    "enable_cfg": false,
+    "patch_size": 2,
+    "tokenizer_max_length": 512,
+    "rope_type": "flashinfer",
+    "text_encoder_out_layers": [10, 20, 30],
+    "cpu_offload": true,
+    "offload_granularity": "block"
+}
10 changes: 8 additions & 2 deletions lightx2v/common/ops/utils.py
@@ -3,6 +3,7 @@
 from pathlib import Path

 import torch
+from loguru import logger
 from safetensors import safe_open

 from lightx2v.utils.envs import *
@@ -81,10 +82,15 @@ def create_pin_tensor(tensor, transpose=False, dtype=None):
         dtype: Target data type of the pinned tensor (optional, defaults to source tensor's dtype)

     Returns:
-        Pinned memory tensor (on CPU) with optional transposition applied
+        Pinned memory tensor (on CPU) with optional transposition applied.
+        Falls back to regular CPU tensor if pinned memory allocation fails.
     """
     dtype = dtype or tensor.dtype
-    pin_tensor = torch.empty(tensor.shape, pin_memory=True, dtype=dtype)
+    try:
+        pin_tensor = torch.empty(tensor.shape, pin_memory=True, dtype=dtype)
+    except Exception as e:
Contributor


medium

Catching the generic Exception class is too broad and can mask unrelated errors. For PyTorch memory allocation failures, it is better to catch RuntimeError specifically, as that is what torch.empty typically raises when pinned memory allocation fails.

Suggested change
-    except Exception as e:
+    except RuntimeError as e:
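
As a quick illustration of the exception-handling point above (a sketch, not repository code; pin_or_fallback is a hypothetical name):

import torch

def pin_or_fallback(t: torch.Tensor) -> torch.Tensor:
    # Mirror the patched create_pin_tensor behavior, but catch RuntimeError
    # specifically, as suggested above.
    try:
        pinned = torch.empty(t.shape, pin_memory=True, dtype=t.dtype)
    except RuntimeError:
        # Pinned allocation can fail when page-locked memory is exhausted;
        # a pageable tensor still works, just without async-copy overlap.
        pinned = torch.empty(t.shape, dtype=t.dtype)
    return pinned.copy_(t)

# Host-to-device copies overlap with compute only when the source is pinned.
weights = pin_or_fallback(torch.randn(1024, 1024))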

+        logger.warning(f"Failed to allocate pinned memory (shape={tensor.shape}, dtype={dtype}): {e}. Falling back to regular CPU memory.")
+        pin_tensor = torch.empty(tensor.shape, dtype=dtype)
     pin_tensor = pin_tensor.copy_(tensor)
     if transpose:
         pin_tensor = pin_tensor.t()
6 changes: 6 additions & 0 deletions lightx2v/models/networks/flux2/weights/transformer_weights.py
@@ -238,12 +238,18 @@ def to_cuda(self, non_blocking=True):
             block.to_cuda(non_blocking=non_blocking)
         for block in self.single_blocks:
             block.to_cuda(non_blocking=non_blocking)
+        self.double_stream_modulation_img_linear.to_cuda(non_blocking=non_blocking)
+        self.double_stream_modulation_txt_linear.to_cuda(non_blocking=non_blocking)
+        self.single_stream_modulation_linear.to_cuda(non_blocking=non_blocking)

     def to_cpu(self, non_blocking=True):
         for block in self.double_blocks:
             block.to_cpu(non_blocking=non_blocking)
         for block in self.single_blocks:
             block.to_cpu(non_blocking=non_blocking)
+        self.double_stream_modulation_img_linear.to_cpu(non_blocking=non_blocking)
+        self.double_stream_modulation_txt_linear.to_cpu(non_blocking=non_blocking)
+        self.single_stream_modulation_linear.to_cpu(non_blocking=non_blocking)


 # Backward-compatible aliases
34 changes: 28 additions & 6 deletions lightx2v/models/runners/flux2/flux2_runner.py
@@ -13,6 +13,8 @@
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v_platform.base.global_var import AI_DEVICE

+torch_device_module = getattr(torch, AI_DEVICE)
Contributor


high

Using getattr(torch, AI_DEVICE) is risky. If AI_DEVICE is a device string like "cuda:0" or "cpu", this will raise an AttributeError. Typically, AI_DEVICE refers to the device identifier used with torch.device(), while getattr expects a module name like "cuda" or "mps". Additionally, torch does not have a cpu attribute that acts as a device module. Consider extracting the device type (e.g., AI_DEVICE.split(':')[0]) and handling the "cpu" case explicitly to avoid a crash.
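
A minimal sketch of the safer lookup described here (illustrative only; resolve_device_module is a hypothetical helper, and AI_DEVICE is assumed to be a string such as "cuda", "cuda:0", or "cpu"):

import torch

def resolve_device_module(ai_device: str):
    # Keep only the device type: "cuda:0" -> "cuda".
    device_type = ai_device.split(":")[0]
    if device_type == "cpu":
        # No accelerator cache to manage on CPU; callers must handle None.
        return None
    return getattr(torch, device_type)  # e.g. torch.cuda, torch.mps

torch_device_module = resolve_device_module("cuda:0")
if torch_device_module is not None:
    torch_device_module.empty_cache()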



 def calculate_dimensions(target_area, ratio):
     width = math.sqrt(target_area * ratio)
@@ -45,8 +47,11 @@ def load_vae(self):

     def init_modules(self):
         logger.info(f"Initializing {self.config['model_cls']} modules...")
-        self.load_model()
-        self.model.set_scheduler(self.scheduler)
+        if not self.config.get("lazy_load", False) and not self.config.get("unload_modules", False):
+            self.load_model()
+            self.model.set_scheduler(self.scheduler)
+        elif self.config.get("lazy_load", False):
+            assert self.config.get("cpu_offload", False)
Contributor


medium

Using assert for runtime configuration validation is discouraged because assertions can be disabled in optimized Python execution (using the -O flag). It is better to raise a ValueError to ensure the check is always performed.

Suggested change
-            assert self.config.get("cpu_offload", False)
+            if not self.config.get("cpu_offload", False):
+                raise ValueError("cpu_offload must be enabled when lazy_load is true")


         task = self.config.get("task", "t2i")
         if task == "i2i":
@@ -59,8 +64,12 @@
     @ProfilingContext4DebugL2("Run Encoders")
     def _run_input_encoder_local_t2i(self):
         prompt = self.input_info.prompt
+        if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
+            self.text_encoders = self.load_text_encoder()
         text_encoder_output = self.run_text_encoder(prompt, neg_prompt=self.input_info.negative_prompt)
-        torch.cuda.empty_cache()
+        if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
+            del self.text_encoders[0]
Contributor


medium

Deleting only the first element of the list leaves self.text_encoders as an empty list []. It is cleaner and more consistent with how self.vae is handled (line 281) to delete the entire attribute, which also avoids potential IndexError if the list is accessed elsewhere while empty.

Suggested change
-            del self.text_encoders[0]
+            del self.text_encoders

+        torch_device_module.empty_cache()
         gc.collect()
         return {
             "text_encoder_output": text_encoder_output,
@@ -70,7 +79,11 @@ def _run_input_encoder_local_t2i(self):
     @ProfilingContext4DebugL2("Run Encoders I2I")
     def _run_input_encoder_local_i2i(self):
         prompt = self.input_info.prompt
+        if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
+            self.text_encoders = self.load_text_encoder()
         text_encoder_output = self.run_text_encoder(prompt, neg_prompt=self.input_info.negative_prompt)
+        if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
+            del self.text_encoders[0]
Contributor


medium

Deleting only the first element of the list leaves self.text_encoders as an empty list []. It is cleaner and more consistent with how self.vae is handled (line 281) to delete the entire attribute.

Suggested change
-            del self.text_encoders[0]
+            del self.text_encoders


         image_path = self.input_info.image_path
         from PIL import Image
@@ -108,7 +121,7 @@ def _run_input_encoder_local_i2i(self):
             if index == 0:
                 self.input_info.target_shape = (image_height, image_width)

-        torch.cuda.empty_cache()
+        torch_device_module.empty_cache()
         gc.collect()

         return {
@@ -244,6 +257,9 @@ def set_img_shapes(self):

     @ProfilingContext4DebugL1("Run VAE Decoder")
     def run_vae_decoder(self, latents):
+        if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
+            self.vae = self.load_vae()
+
         B, _, C = latents.shape

         H = int((self.input_info.latent_image_ids[0, :, 1].max() + 1).item())
@@ -252,14 +268,20 @@
         latents = latents.view(B, H, W, C).permute(0, 3, 1, 2)

         bn_mean = self.vae.vae.bn.running_mean.view(1, -1, 1, 1).to(latents.device, latents.dtype)
-        bn_std = torch.sqrt(self.vae.vae.bn.running_var.view(1, -1, 1, 1) + self.vae.vae.config.batch_norm_eps)
+        bn_std = torch.sqrt(self.vae.vae.bn.running_var.view(1, -1, 1, 1) + self.vae.vae.config.batch_norm_eps).to(latents.device, latents.dtype)
         latents = latents * bn_std + bn_mean

         latents = latents.reshape(B, C // 4, 2, 2, H, W)
         latents = latents.permute(0, 1, 4, 2, 5, 3)
         latents = latents.reshape(B, C // 4, H * 2, W * 2)

         images = self.vae.decode(latents, self.input_info)
+
+        if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
+            del self.vae
+            torch_device_module.empty_cache()
+            gc.collect()
+
         return images

     @ProfilingContext4DebugL1("RUN pipeline")
@@ -279,7 +301,7 @@ def run_pipeline(self, input_info):
             image.save(input_info.save_result_path)
             logger.info(f"Image saved: {input_info.save_result_path}")

-        torch.cuda.empty_cache()
+        torch_device_module.empty_cache()
         gc.collect()

         if input_info.return_result_tensor:
18 changes: 18 additions & 0 deletions scripts/flux2/infer_flux2_dev_offload.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+lightx2v_path=
+model_path="/data/temp/FLUX.2-dev"
+export CUDA_VISIBLE_DEVICES=3
+
+source ${lightx2v_path}/scripts/base/base.sh
+
+# Create output directory
+mkdir -p ${lightx2v_path}/save_results
+
+python -m lightx2v.infer \
+    --model_cls flux2_dev \
+    --task t2i \
+    --model_path $model_path \
+    --config_json "${lightx2v_path}/configs/flux2/flux2_dev_offload.json" \
+    --prompt "Realistic macro photograph of a hermit crab using a soda can as its shell, partially emerging from the can, captured with sharp detail and natural colors, on a sunlit beach with soft shadows and a shallow depth of field, with blurred ocean waves in the background. The can has the text 'BFL Diffusers' on it and it has a color gradient that start with #FF5733 at the top and transitions to #33FF57 at the bottom." \
+    --save_result_path "${lightx2v_path}/save_results/flux2_dev_offload.png" \
+    --seed 42