
Commit 7ae4442 ("update")
1 parent: 183bcd5

File tree: 5 files changed, +297 / -51 lines


src/diffusers/modular_pipelines/wan/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -25,12 +25,14 @@
     _import_structure["modular_blocks"] = [
         "ALL_BLOCKS",
         "AUTO_BLOCKS",
+        "IMAGE2VIDEO_BLOCKS",
         "TEXT2VIDEO_BLOCKS",
         "WanAutoBeforeDenoiseStep",
         "WanAutoBlocks",
         "WanAutoBlocks",
         "WanAutoDecodeStep",
         "WanAutoDenoiseStep",
+        "WanAutoVaeEncoderStep",
     ]
     _import_structure["modular_pipeline"] = ["WanModularPipeline"]

@@ -45,11 +47,13 @@
         from .modular_blocks import (
            ALL_BLOCKS,
            AUTO_BLOCKS,
+           IMAGE2VIDEO_BLOCKS,
            TEXT2VIDEO_BLOCKS,
            WanAutoBeforeDenoiseStep,
            WanAutoBlocks,
            WanAutoDecodeStep,
            WanAutoDenoiseStep,
+           WanAutoVaeEncoderStep,
        )
        from .modular_pipeline import WanModularPipeline
 else:
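
For reference, a minimal usage sketch of the newly exported names. The public import path, the assumption that IMAGE2VIDEO_BLOCKS is an ordered mapping of block names to block classes (mirroring TEXT2VIDEO_BLOCKS), and the no-argument instantiation of WanAutoBlocks are assumptions, not something this commit spells out:

# Sketch only: paths and call signatures below are assumed, not taken from the commit.
from diffusers.modular_pipelines.wan import IMAGE2VIDEO_BLOCKS, WanAutoBlocks

# List the blocks that make up the image-to-video preset.
for name, block_cls in IMAGE2VIDEO_BLOCKS.items():
    print(name, "->", block_cls.__name__)

# The auto blocks are expected to bundle both the text2video and image2video paths.
blocks = WanAutoBlocks()
print(blocks)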

src/diffusers/modular_pipelines/wan/before_denoise.py

Lines changed: 4 additions & 1 deletion
@@ -282,7 +282,10 @@ def intermediate_outputs(self) -> List[OutputParam]:
         return [
             OutputParam(
                 "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
-            )
+            ),
+            OutputParam("height", type_hint=int),
+            OutputParam("width", type_hint=int),
+            OutputParam("num_frames", type_hint=int),
         ]

     @staticmethod
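
With height, width, and num_frames now published as intermediate outputs, later blocks can read them from the shared pipeline state instead of re-declaring them as user inputs. A hypothetical consumer block (class name and import path below are illustrative assumptions, not part of the commit) would declare them like this:

from typing import List

# Assumed import path for the modular-pipeline primitives used throughout this diff.
from diffusers.modular_pipelines import InputParam, PipelineBlock


class MyWanConsumerBlock(PipelineBlock):  # hypothetical downstream block
    model_name = "wan"

    @property
    def intermediate_inputs(self) -> List[InputParam]:
        # Values produced upstream by the prepare-latents step in this commit.
        return [
            InputParam("height", type_hint=int),
            InputParam("width", type_hint=int),
            InputParam("num_frames", type_hint=int),
        ]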

src/diffusers/modular_pipelines/wan/denoise.py

Lines changed: 180 additions & 4 deletions
@@ -34,6 +34,56 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


+class WanI2VLoopBeforeDenoiser(PipelineBlock):
+    model_name = "wan"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("scheduler", UniPCMultistepScheduler),
+        ]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Step within the denoising loop that prepares the latent input for the denoiser. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `WanI2VDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def intermediate_inputs(self) -> List[str]:
+        return [
+            InputParam(
+                "latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The initial latents to use for the denoising process.",
+            ),
+            InputParam(
+                "latent_condition",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The latent condition to use for the denoising process.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "concatenated_latents",
+                type_hint=torch.Tensor,
+                description="The concatenated noisy and conditioning latents to use for the denoising process.",
+            ),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: WanModularPipeline, block_state: BlockState, i: int, t: int):
+        block_state.concatenated_latents = torch.cat([block_state.latents, block_state.latent_condition], dim=1)
+        return components, block_state
+
+
 class WanLoopDenoiser(PipelineBlock):
     model_name = "wan"

@@ -102,7 +152,7 @@ def __call__(
         components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)

         # Prepare mini-batches according to guidance method and `guider_input_fields`
-        # Each guider_state_batch will have .prompt_embeds, .time_ids, text_embeds, image_embeds.
+        # Each guider_state_batch will have .prompt_embeds.
         # e.g. for CFG, we prepare two batches: one for uncond, one for cond
         # for first batch, guider_state_batch.prompt_embeds correspond to block_state.prompt_embeds
         # for second batch, guider_state_batch.prompt_embeds correspond to block_state.negative_prompt_embeds
@@ -120,7 +170,112 @@ def __call__(
             guider_state_batch.noise_pred = components.transformer(
                 hidden_states=block_state.latents.to(transformer_dtype),
                 timestep=t.flatten(),
-                encoder_hidden_states=prompt_embeds,
+                encoder_hidden_states=prompt_embeds.to(transformer_dtype),
+                attention_kwargs=block_state.attention_kwargs,
+                return_dict=False,
+            )[0]
+        components.guider.cleanup_models(components.transformer)
+
+        # Perform guidance
+        block_state.noise_pred, block_state.scheduler_step_kwargs = components.guider(guider_state)
+
+        return components, block_state
+
+
+class WanI2VLoopDenoiser(PipelineBlock):
+    model_name = "wan"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 5.0}),
+                default_creation_method="from_config",
+            ),
+            ComponentSpec("transformer", WanTransformer3DModel),
+        ]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Step within the denoising loop that denoises the latents with guidance. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `WanDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def inputs(self) -> List[Tuple[str, Any]]:
+        return [
+            InputParam("attention_kwargs"),
+        ]
+
+    @property
+    def intermediate_inputs(self) -> List[str]:
+        return [
+            InputParam(
+                "concatenated_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The initial latents to use for the denoising process.",
+            ),
+            InputParam(
+                "encoder_hidden_states_image",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The encoder hidden states for the image inputs.",
+            ),
+            InputParam(
+                "num_inference_steps",
+                required=True,
+                type_hint=int,
+                description="The number of inference steps to use for the denoising process.",
+            ),
+            InputParam(
+                kwargs_type="guider_input_fields",
+                description=(
+                    "All conditional model inputs that need to be prepared with guider. "
+                    "It should contain prompt_embeds/negative_prompt_embeds. "
+                    "Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
+                ),
+            ),
+        ]
+
+    @torch.no_grad()
+    def __call__(
+        self, components: WanModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
+    ) -> PipelineState:
+        # Map the keys we'll see on each `guider_state_batch` (e.g. guider_state_batch.prompt_embeds)
+        # to the corresponding (cond, uncond) fields on block_state. (e.g. block_state.prompt_embeds, block_state.negative_prompt_embeds)
+        guider_input_fields = {
+            "prompt_embeds": ("prompt_embeds", "negative_prompt_embeds"),
+        }
+        transformer_dtype = components.transformer.dtype
+
+        components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
+
+        # Prepare mini-batches according to guidance method and `guider_input_fields`
+        # Each guider_state_batch will have .prompt_embeds.
+        # e.g. for CFG, we prepare two batches: one for uncond, one for cond
+        # for first batch, guider_state_batch.prompt_embeds correspond to block_state.prompt_embeds
+        # for second batch, guider_state_batch.prompt_embeds correspond to block_state.negative_prompt_embeds
+        guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
+
+        # run the denoiser for each guidance batch
+        for guider_state_batch in guider_state:
+            components.guider.prepare_models(components.transformer)
+            cond_kwargs = guider_state_batch.as_dict()
+            cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields}
+            prompt_embeds = cond_kwargs.pop("prompt_embeds")
+
+            # Predict the noise residual
+            # store the noise_pred in guider_state_batch so that we can apply guidance across all batches
+            guider_state_batch.noise_pred = components.transformer(
+                hidden_states=block_state.concatenated_latents.to(transformer_dtype),
+                timestep=t.flatten(),
+                encoder_hidden_states=prompt_embeds.to(transformer_dtype),
+                encoder_hidden_states_image=block_state.encoder_hidden_states_image.to(transformer_dtype),
                 attention_kwargs=block_state.attention_kwargs,
                 return_dict=False,
             )[0]
@@ -247,7 +402,7 @@ class WanDenoiseStep(WanDenoiseLoopWrapper):
         WanLoopDenoiser,
         WanLoopAfterDenoiser,
     ]
-    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+    block_names = ["denoiser", "after_denoiser"]

     @property
     def description(self) -> str:
@@ -257,5 +412,26 @@ def description(self) -> str:
             "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
             " - `WanLoopDenoiser`\n"
             " - `WanLoopAfterDenoiser`\n"
-            "This block supports both text2vid tasks."
+            "This block supports the text2vid task."
+        )
+
+
+class WanI2VDenoiseStep(WanDenoiseLoopWrapper):
+    block_classes = [
+        WanI2VLoopBeforeDenoiser,
+        WanI2VLoopDenoiser,
+        WanLoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoises the latents with conditional first- and last-frame support. \n"
+            "Its loop logic is defined in `WanDenoiseLoopWrapper.__call__` method \n"
+            "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
+            " - `WanI2VLoopBeforeDenoiser`\n"
+            " - `WanI2VLoopDenoiser`\n"
+            " - `WanLoopAfterDenoiser`\n"
+            "This block supports the image-to-video and first-last-frame-to-video tasks."
         )
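
As a shape sketch of what the new before-denoiser block does: WanI2VLoopBeforeDenoiser concatenates the noisy latents with the mask-plus-image conditioning latents along the channel dimension at every step. The concrete sizes below (16 latent channels, 4 mask channels, a 21-frame, 60x104 latent grid) are typical Wan values assumed for illustration and are not stated in this commit:

import torch

# (batch, z_dim, latent_frames, latent_height, latent_width) -- sizes assumed for illustration
latents = torch.randn(1, 16, 21, 60, 104)
# 4 mask channels + 16 encoded first/last-frame channels
latent_condition = torch.randn(1, 20, 21, 60, 104)

# This is the operation performed in WanI2VLoopBeforeDenoiser.__call__
concatenated_latents = torch.cat([latents, latent_condition], dim=1)
print(concatenated_latents.shape)  # torch.Size([1, 36, 21, 60, 104])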

src/diffusers/modular_pipelines/wan/encoders.py

Lines changed: 35 additions & 30 deletions
@@ -259,7 +259,7 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
         return components, state


-class WanImageEncodeStep(PipelineBlock):
+class WanImageEncoderStep(PipelineBlock):
     model_name = "wan"

     @property
@@ -368,15 +368,15 @@ def inputs(self) -> List[InputParam]:
         return [
             InputParam("image", required=True),
             InputParam("last_image", required=False),
-            InputParam("height", type_hint=int),
-            InputParam("width", type_hint=int),
-            InputParam("num_frames", type_hint=int),
         ]

     @property
     def intermediate_inputs(self) -> List[InputParam]:
         return [
-            InputParam("num_channels_latents", type_hint=int),
+            InputParam("height", type_hint=int),
+            InputParam("width", type_hint=int),
+            InputParam("num_frames", type_hint=int),
+            InputParam("batch_size", type_hint=int),
             InputParam("generator"),
             InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
         ]
@@ -388,11 +388,12 @@ def intermediate_outputs(self) -> List[OutputParam]:
                 "latent_condition",
                 type_hint=torch.Tensor,
                 description="The latents representing the reference first-frame/last-frame for conditioned video generation.",
-            )
+            ),
+            OutputParam("num_channels_latents", type_hint=int),
         ]

+    @staticmethod
     def _encode_vae_image(
-        self,
         components: WanModularPipeline,
         batch_size: int,
         height: int,
@@ -404,11 +405,13 @@ def _encode_vae_image(
         last_image: Optional[torch.Tensor] = None,
         generator: Optional[torch.Generator] = None,
     ):
-        latent_height = height // self.vae_scale_factor_spatial
-        latent_width = width // self.vae_scale_factor_spatial
+        latent_height = height // components.vae_scale_factor_spatial
+        latent_width = width // components.vae_scale_factor_spatial

         latents_mean = (
-            torch.tensor(components.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).to(device, dtype)
+            torch.tensor(components.vae.config.latents_mean)
+            .view(1, components.vae.config.z_dim, 1, 1, 1)
+            .to(device, dtype)
         )
         latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
             1, components.vae.config.z_dim, 1, 1, 1
@@ -429,11 +432,11 @@

         if isinstance(generator, list):
             latent_condition = [
-                retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax") for _ in generator
+                retrieve_latents(components.vae.encode(video_condition), sample_mode="argmax") for _ in generator
             ]
             latent_condition = torch.cat(latent_condition)
         else:
-            latent_condition = retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax")
+            latent_condition = retrieve_latents(components.vae.encode(video_condition), sample_mode="argmax")
         latent_condition = latent_condition.repeat(batch_size, 1, 1, 1, 1)

         latent_condition = latent_condition.to(dtype)
@@ -445,9 +448,13 @@
         else:
             mask_lat_size[:, :, list(range(1, num_frames - 1))] = 0
         first_frame_mask = mask_lat_size[:, :, 0:1]
-        first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
+        first_frame_mask = torch.repeat_interleave(
+            first_frame_mask, dim=2, repeats=components.vae_scale_factor_temporal
+        )
         mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
-        mask_lat_size = mask_lat_size.view(batch_size, -1, self.vae_scale_factor_temporal, latent_height, latent_width)
+        mask_lat_size = mask_lat_size.view(
+            batch_size, -1, components.vae_scale_factor_temporal, latent_height, latent_width
+        )
         mask_lat_size = mask_lat_size.transpose(1, 2)
         mask_lat_size = mask_lat_size.to(latent_condition.device)
         latent_condition = torch.concat([mask_lat_size, latent_condition], dim=1)
@@ -458,32 +465,30 @@
     def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
         block_state.device = components._execution_device
-        block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
-        block_state.num_channels_latents = self.vae.config.z_dim
-        block_state.batch_size = (
-            block_state.batch_size if block_state.batch_size is not None else block_state.image.shape[0]
-        )
+        block_state.num_channels_latents = components.vae.config.z_dim

-        block_state.image = self.video_processor.preprocess(
+        block_state.image = components.video_processor.preprocess(
            block_state.image, height=block_state.height, width=block_state.width
        ).to(block_state.device, dtype=torch.float32)
+
         if block_state.last_image is not None:
-            block_state.last_image = self.video_processor.preprocess(
+            block_state.last_image = components.video_processor.preprocess(
                block_state.last_image, height=block_state.height, width=block_state.width
            ).to(block_state.device, dtype=torch.float32)

         block_state.latent_condition = self._encode_vae_image(
             components,
-            batch_size=block_state.batch_size,
-            height=block_state.height,
-            width=block_state.width,
-            num_frames=block_state.num_frames,
-            image=block_state.image,
-            device=block_state.device,
-            dtype=block_state.dtype,
-            last_image=block_state.last_image,
-            generator=block_state.generator,
+            block_state.batch_size,
+            block_state.height,
+            block_state.width,
+            block_state.num_frames,
+            block_state.image,
+            block_state.device,
+            block_state.dtype,
+            block_state.last_image,
+            block_state.generator,
         )

         self.set_block_state(state, block_state)
+
         return components, state
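
A standalone sketch of the conditioning-mask layout that _encode_vae_image builds before concatenating it with the encoded first/last-frame latents. The sizes (81 frames, temporal scale factor 4, 60x104 latent grid) are assumed for illustration and follow the shown else branch, which keeps the first and last frames:

import torch

batch_size, num_frames = 1, 81
vae_scale_factor_temporal = 4
latent_height, latent_width = 60, 104

# 1 where a frame is conditioned on, 0 elsewhere (first and last frame kept here).
mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
mask_lat_size[:, :, list(range(1, num_frames - 1))] = 0

# Expand the first frame so the mask aligns with the temporally compressed latents,
# then fold the frame axis into (latent_frames, vae_scale_factor_temporal) and swap it
# into the channel position.
first_frame_mask = mask_lat_size[:, :, 0:1]
first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=vae_scale_factor_temporal)
mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
mask_lat_size = mask_lat_size.view(batch_size, -1, vae_scale_factor_temporal, latent_height, latent_width)
mask_lat_size = mask_lat_size.transpose(1, 2)
print(mask_lat_size.shape)  # torch.Size([1, 4, 21, 60, 104])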
