Skip to content

Commit b73c738

Browse files
authored
Remove device synchronization when loading weights (#11927)
* update
* make style
1 parent 06fd427 commit b73c738

File tree

6 files changed

+6
-24
lines changed

6 files changed

+6
-24
lines changed

src/diffusers/loaders/single_file_model.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
from .. import __version__
2525
from ..quantizers import DiffusersAutoQuantizer
2626
from ..utils import deprecate, is_accelerate_available, logging
27-
from ..utils.torch_utils import device_synchronize, empty_device_cache
27+
from ..utils.torch_utils import empty_device_cache
2828
from .single_file_utils import (
2929
SingleFileComponentError,
3030
convert_animatediff_checkpoint_to_diffusers,
@@ -431,10 +431,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
431431
keep_in_fp32_modules=keep_in_fp32_modules,
432432
unexpected_keys=unexpected_keys,
433433
)
434-
# Ensure tensors are correctly placed on device by synchronizing before returning control to user. This is
435-
# required because we move tensors with non_blocking=True, which is slightly faster for model loading.
436434
empty_device_cache()
437-
device_synchronize()
438435
else:
439436
_, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)
440437

src/diffusers/loaders/single_file_utils.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@
4646
)
4747
from ..utils.constants import DIFFUSERS_REQUEST_TIMEOUT
4848
from ..utils.hub_utils import _get_model_file
49-
from ..utils.torch_utils import device_synchronize, empty_device_cache
49+
from ..utils.torch_utils import empty_device_cache
5050

5151

5252
if is_transformers_available():
@@ -1690,10 +1690,7 @@ def create_diffusers_clip_model_from_ldm(
16901690

16911691
if is_accelerate_available():
16921692
load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
1693-
# Ensure tensors are correctly placed on device by synchronizing before returning control to user. This is
1694-
# required because we move tensors with non_blocking=True, which is slightly faster for model loading.
16951693
empty_device_cache()
1696-
device_synchronize()
16971694
else:
16981695
model.load_state_dict(diffusers_format_checkpoint, strict=False)
16991696

@@ -2153,10 +2150,7 @@ def create_diffusers_t5_model_from_checkpoint(
21532150

21542151
if is_accelerate_available():
21552152
load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
2156-
# Ensure tensors are correctly placed on device by synchronizing before returning control to user. This is
2157-
# required because we move tensors with non_blocking=True, which is slightly faster for model loading.
21582153
empty_device_cache()
2159-
device_synchronize()
21602154
else:
21612155
model.load_state_dict(diffusers_format_checkpoint)
21622156

src/diffusers/loaders/transformer_flux.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
)
2020
from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
2121
from ..utils import is_accelerate_available, is_torch_version, logging
22-
from ..utils.torch_utils import device_synchronize, empty_device_cache
22+
from ..utils.torch_utils import empty_device_cache
2323

2424

2525
if is_accelerate_available():
@@ -82,7 +82,6 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us
8282
device_map = {"": self.device}
8383
load_model_dict_into_meta(image_projection, updated_state_dict, device_map=device_map, dtype=self.dtype)
8484
empty_device_cache()
85-
device_synchronize()
8685

8786
return image_projection
8887

@@ -158,7 +157,6 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=_
158157
key_id += 1
159158

160159
empty_device_cache()
161-
device_synchronize()
162160

163161
return attn_procs
164162

src/diffusers/loaders/transformer_sd3.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
from ..models.embeddings import IPAdapterTimeImageProjection
1919
from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
2020
from ..utils import is_accelerate_available, is_torch_version, logging
21-
from ..utils.torch_utils import device_synchronize, empty_device_cache
21+
from ..utils.torch_utils import empty_device_cache
2222

2323

2424
logger = logging.get_logger(__name__)
@@ -82,7 +82,6 @@ def _convert_ip_adapter_attn_to_diffusers(
8282
)
8383

8484
empty_device_cache()
85-
device_synchronize()
8685

8786
return attn_procs
8887

@@ -152,7 +151,6 @@ def _convert_ip_adapter_image_proj_to_diffusers(
152151
device_map = {"": self.device}
153152
load_model_dict_into_meta(image_proj, updated_state_dict, device_map=device_map, dtype=self.dtype)
154153
empty_device_cache()
155-
device_synchronize()
156154

157155
return image_proj
158156

src/diffusers/loaders/unet.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343
is_torch_version,
4444
logging,
4545
)
46-
from ..utils.torch_utils import device_synchronize, empty_device_cache
46+
from ..utils.torch_utils import empty_device_cache
4747
from .lora_base import _func_optionally_disable_offloading
4848
from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE, TEXT_ENCODER_NAME, UNET_NAME
4949
from .utils import AttnProcsLayers
@@ -755,7 +755,6 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us
755755
device_map = {"": self.device}
756756
load_model_dict_into_meta(image_projection, updated_state_dict, device_map=device_map, dtype=self.dtype)
757757
empty_device_cache()
758-
device_synchronize()
759758

760759
return image_projection
761760

@@ -854,7 +853,6 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=_
854853
key_id += 2
855854

856855
empty_device_cache()
857-
device_synchronize()
858856

859857
return attn_procs
860858

src/diffusers/models/modeling_utils.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@
6262
load_or_create_model_card,
6363
populate_model_card,
6464
)
65-
from ..utils.torch_utils import device_synchronize, empty_device_cache
65+
from ..utils.torch_utils import empty_device_cache
6666
from .model_loading_utils import (
6767
_caching_allocator_warmup,
6868
_determine_device_map,
@@ -1540,10 +1540,7 @@ def _load_pretrained_model(
15401540
assign_to_params_buffers = check_support_param_buffer_assignment(model, state_dict)
15411541
error_msgs += _load_state_dict_into_model(model, state_dict, assign_to_params_buffers)
15421542

1543-
# Ensure tensors are correctly placed on device by synchronizing before returning control to user. This is
1544-
# required because we move tensors with non_blocking=True, which is slightly faster for model loading.
15451543
empty_device_cache()
1546-
device_synchronize()
15471544

15481545
if offload_index is not None and len(offload_index) > 0:
15491546
save_offload_index(offload_index, offload_folder)

0 commit comments

Comments (0)