```diff
@@ -35,7 +35,7 @@
 
 
 class WanI2VLoopBeforeDenoiser(PipelineBlock):
-    model_name = "stable-diffusion-xl"
+    model_name = "wan"
 
     @property
     def expected_components(self) -> List[ComponentSpec]:
@@ -72,15 +72,15 @@ def intermediate_inputs(self) -> List[str]:
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
             OutputParam(
-                "concatenated_latents",
+                "latent_model_inputs",
                 type_hint=torch.Tensor,
                 description="The concatenated noisy and conditioning latents to use for the denoising process.",
             ),
         ]
 
     @torch.no_grad()
     def __call__(self, components: WanModularPipeline, block_state: BlockState, i: int, t: int):
-        block_state.concatenated_latents = torch.cat([block_state.latents, block_state.latent_condition], dim=1)
+        block_state.latent_model_inputs = torch.cat([block_state.latents, block_state.latent_condition], dim=1)
         return components, block_state
 
 
@@ -215,13 +215,13 @@ def inputs(self) -> List[Tuple[str, Any]]:
     def intermediate_inputs(self) -> List[str]:
         return [
             InputParam(
-                "concatenated_latents",
+                "latent_model_inputs",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The initial latents to use for the denoising process.",
             ),
             InputParam(
-                "encoder_hidden_states_image",
+                "image_embeds",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The encoder hidden states for the image inputs.",
@@ -272,10 +272,10 @@ def __call__(
             # Predict the noise residual
            # store the noise_pred in guider_state_batch so that we can apply guidance across all batches
            guider_state_batch.noise_pred = components.transformer(
-                hidden_states=block_state.concatenated_latents.to(transformer_dtype),
+                hidden_states=block_state.latent_model_inputs.to(transformer_dtype),
                timestep=t.flatten(),
                encoder_hidden_states=prompt_embeds.to(transformer_dtype),
-                encoder_hidden_states_image=block_state.encoder_hidden_states_image.to(transformer_dtype),
+                encoder_hidden_states_image=block_state.image_embeds.to(transformer_dtype),
                attention_kwargs=block_state.attention_kwargs,
                return_dict=False,
            )[0]
```
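The renames in this diff keep the producer and consumer blocks consistent: `WanI2VLoopBeforeDenoiser` now publishes its intermediate output as `latent_model_inputs`, and the denoiser block declares an `InputParam` with the same name (likewise `image_embeds` for the image encoder hidden states). Below is a minimal standalone sketch, not the diffusers implementation, of why these names must agree: intermediates travel between blocks as attributes on the shared block state, so a stale name on either side fails at runtime. The simplified `BlockState` and the `before_denoiser`/`denoiser` functions here are hypothetical stand-ins.

```python
import torch


class BlockState:
    """Simplified stand-in for the state object shared between pipeline blocks."""


def before_denoiser(state: BlockState) -> BlockState:
    # Producer: concatenate the noisy latents with the conditioning latents
    # along the channel dimension and store the result under the agreed-upon name.
    state.latent_model_inputs = torch.cat([state.latents, state.latent_condition], dim=1)
    return state


def denoiser(state: BlockState) -> torch.Tensor:
    # Consumer: read the same attribute. If the producer still wrote the old
    # name ("concatenated_latents"), this lookup would raise AttributeError.
    return state.latent_model_inputs


state = BlockState()
state.latents = torch.randn(1, 16, 9, 8, 8)           # toy noisy video latents
state.latent_condition = torch.randn(1, 20, 9, 8, 8)  # toy conditioning latents
print(denoiser(before_denoiser(state)).shape)          # torch.Size([1, 36, 9, 8, 8])
```

Declaring the names through `OutputParam`/`InputParam` makes this contract explicit, which is why both sides of each rename have to land in the same change.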