mirror of https://github.com/guoyww/AnimateDiff.git (synced 2026-04-03 01:36:20 +02:00)

update
@@ -48,16 +48,7 @@ from diffusers.schedulers import (
     PNDMScheduler,
     UnCLIPScheduler,
 )
-# from diffusers.utils import is_omegaconf_available, is_safetensors_available, logging
 from diffusers.utils.import_utils import BACKENDS_MAPPING
-# from diffusers.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
-# from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
-# from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-# from .safety_checker import StableDiffusionSafetyChecker
-# from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
-
-
-# logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
 
 def shave_segments(path, n_shave_prefix_segments=1):
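An aside on the import this hunk keeps: BACKENDS_MAPPING maps an optional backend to an (availability check, error message) pair, and the code deleted further down used it together with is_omegaconf_available / is_safetensors_available to fail loudly when an optional dependency is missing. A minimal sketch of that pattern, assuming a diffusers version that still exports these helpers (newer releases dropped is_omegaconf_available, which is likely why the import above was commented out):

    # Sketch only: these helpers come from older diffusers releases.
    from diffusers.utils.import_utils import BACKENDS_MAPPING, is_omegaconf_available

    if not is_omegaconf_available():
        # BACKENDS_MAPPING["omegaconf"][1] is the human-readable install hint.
        raise ValueError(BACKENDS_MAPPING["omegaconf"][1])

    from omegaconf import OmegaConf  # safe to import once the check has passed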
@@ -724,8 +715,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 
 
 def convert_ldm_clip_checkpoint(checkpoint):
-    # text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
-    text_model = CLIPTextModel.from_pretrained("/mnt/petrelfs/guoyuwei/projects/huggingface/clip-vit-large-patch14")
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
     keys = list(checkpoint.keys())
 
     text_model_dict = {}
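The only functional change here swaps the author's hardcoded cluster path for the public Hub id. For readers unfamiliar with the API: from_pretrained accepts either a Hugging Face Hub repo id or a local directory, so the public id works everywhere while the old path only resolved on the author's machine. A hedged sketch (the local path below is a placeholder, not from the commit):

    from transformers import CLIPTextModel

    # Resolved against the Hub; downloaded and cached on first use.
    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

    # Equivalent only if the weights were pre-downloaded to this hypothetical folder:
    # text_model = CLIPTextModel.from_pretrained("/path/to/clip-vit-large-patch14")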
@@ -968,415 +958,3 @@ def convert_controlnet_checkpoint(
     controlnet_model.load_state_dict(converted_ctrl_checkpoint)
 
     return controlnet_model
-
-
-# def download_from_original_stable_diffusion_ckpt(
-#     checkpoint_path: str,
-#     original_config_file: str = None,
-#     image_size: int = 512,
-#     prediction_type: str = None,
-#     model_type: str = None,
-#     extract_ema: bool = False,
-#     scheduler_type: str = "pndm",
-#     num_in_channels: Optional[int] = None,
-#     upcast_attention: Optional[bool] = None,
-#     device: str = None,
-#     from_safetensors: bool = False,
-#     stable_unclip: Optional[str] = None,
-#     stable_unclip_prior: Optional[str] = None,
-#     clip_stats_path: Optional[str] = None,
-#     controlnet: Optional[bool] = None,
-#     load_safety_checker: bool = True,
-#     pipeline_class: DiffusionPipeline = None,
-# ) -> DiffusionPipeline:
-#     """
-#     Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
-#     config file.
-
-#     Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
-#     global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
-#     recommended that you override the default values and/or supply an `original_config_file` wherever possible.
-
-#     Args:
-#         checkpoint_path (`str`): Path to `.ckpt` file.
-#         original_config_file (`str`):
-#             Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
-#             inferred by looking for a key that only exists in SD2.0 models.
-#         image_size (`int`, *optional*, defaults to 512):
-#             The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
-#             Base. Use 768 for Stable Diffusion v2.
-#         prediction_type (`str`, *optional*):
-#             The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable
-#             Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2.
-#         num_in_channels (`int`, *optional*, defaults to None):
-#             The number of input channels. If `None`, it will be automatically inferred.
-#         scheduler_type (`str`, *optional*, defaults to 'pndm'):
-#             Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
-#             "ddim"]`.
-#         model_type (`str`, *optional*, defaults to `None`):
-#             The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder",
-#             "FrozenCLIPEmbedder", "PaintByExample"]`.
-#         is_img2img (`bool`, *optional*, defaults to `False`):
-#             Whether the model should be loaded as an img2img pipeline.
-#         extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
-#             checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
-#             `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
-#             inference. Non-EMA weights are usually better to continue fine-tuning.
-#         upcast_attention (`bool`, *optional*, defaults to `None`):
-#             Whether the attention computation should always be upcasted. This is necessary when running stable
-#             diffusion 2.1.
-#         device (`str`, *optional*, defaults to `None`):
-#             The device to use. Pass `None` to determine automatically.
-#         from_safetensors (`str`, *optional*, defaults to `False`):
-#             If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
-#         load_safety_checker (`bool`, *optional*, defaults to `True`):
-#             Whether to load the safety checker or not. Defaults to `True`.
-#         pipeline_class (`str`, *optional*, defaults to `None`):
-#             The pipeline class to use. Pass `None` to determine automatically.
-#     return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
-#     """
-
-#     # import pipelines here to avoid circular import error when using from_ckpt method
-#     from diffusers import (
-#         LDMTextToImagePipeline,
-#         PaintByExamplePipeline,
-#         StableDiffusionControlNetPipeline,
-#         StableDiffusionPipeline,
-#         StableUnCLIPImg2ImgPipeline,
-#         StableUnCLIPPipeline,
-#     )
-
-#     if pipeline_class is None:
-#         pipeline_class = StableDiffusionPipeline
-
-#     if prediction_type == "v-prediction":
-#         prediction_type = "v_prediction"
-
-#     if not is_omegaconf_available():
-#         raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-#     from omegaconf import OmegaConf
-
-#     if from_safetensors:
-#         if not is_safetensors_available():
-#             raise ValueError(BACKENDS_MAPPING["safetensors"][1])
-
-#         from safetensors import safe_open
-
-#         checkpoint = {}
-#         with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
-#             for key in f.keys():
-#                 checkpoint[key] = f.get_tensor(key)
-#     else:
-#         if device is None:
-#             device = "cuda" if torch.cuda.is_available() else "cpu"
-#             checkpoint = torch.load(checkpoint_path, map_location=device)
-#         else:
-#             checkpoint = torch.load(checkpoint_path, map_location=device)
-
-#     # Sometimes models don't have the global_step item
-#     if "global_step" in checkpoint:
-#         global_step = checkpoint["global_step"]
-#     else:
-#         print("global_step key not found in model")
-#         global_step = None
-
-#     # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
-#     # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
-#     while "state_dict" in checkpoint:
-#         checkpoint = checkpoint["state_dict"]
-
-#     if original_config_file is None:
-#         key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
-
-#         # model_type = "v1"
-#         config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
-
-#         if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
-#             # model_type = "v2"
-#             config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
-
-#             if global_step == 110000:
-#                 # v2.1 needs to upcast attention
-#                 upcast_attention = True
-
-#         original_config_file = BytesIO(requests.get(config_url).content)
-
-#     original_config = OmegaConf.load(original_config_file)
-
-#     if num_in_channels is not None:
-#         original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
-
-#     if (
-#         "parameterization" in original_config["model"]["params"]
-#         and original_config["model"]["params"]["parameterization"] == "v"
-#     ):
-#         if prediction_type is None:
-#             # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
-#             # as it relies on a brittle global step parameter here
-#             prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
-#         if image_size is None:
-#             # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
-#             # as it relies on a brittle global step parameter here
-#             image_size = 512 if global_step == 875000 else 768
-#     else:
-#         if prediction_type is None:
-#             prediction_type = "epsilon"
-#         if image_size is None:
-#             image_size = 512
-
-#     if controlnet is None:
-#         controlnet = "control_stage_config" in original_config.model.params
-
-#     if controlnet:
-#         controlnet_model = convert_controlnet_checkpoint(
-#             checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema
-#         )
-
-#     num_train_timesteps = original_config.model.params.timesteps
-#     beta_start = original_config.model.params.linear_start
-#     beta_end = original_config.model.params.linear_end
-
-#     scheduler = DDIMScheduler(
-#         beta_end=beta_end,
-#         beta_schedule="scaled_linear",
-#         beta_start=beta_start,
-#         num_train_timesteps=num_train_timesteps,
-#         steps_offset=1,
-#         clip_sample=False,
-#         set_alpha_to_one=False,
-#         prediction_type=prediction_type,
-#     )
-#     # make sure scheduler works correctly with DDIM
-#     scheduler.register_to_config(clip_sample=False)
-
-#     if scheduler_type == "pndm":
-#         config = dict(scheduler.config)
-#         config["skip_prk_steps"] = True
-#         scheduler = PNDMScheduler.from_config(config)
-#     elif scheduler_type == "lms":
-#         scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "heun":
-#         scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "euler":
-#         scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "euler-ancestral":
-#         scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "dpm":
-#         scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "ddim":
-#         scheduler = scheduler
-#     else:
-#         raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
-
-#     # Convert the UNet2DConditionModel model.
-#     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
-#     unet_config["upcast_attention"] = upcast_attention
-#     unet = UNet2DConditionModel(**unet_config)
-
-#     converted_unet_checkpoint = convert_ldm_unet_checkpoint(
-#         checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
-#     )
-
-#     unet.load_state_dict(converted_unet_checkpoint)
-
-#     # Convert the VAE model.
-#     vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-#     converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-#     vae = AutoencoderKL(**vae_config)
-#     vae.load_state_dict(converted_vae_checkpoint)
-
-#     # Convert the text model.
-#     if model_type is None:
-#         model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
-#         logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}")
-
-#     if model_type == "FrozenOpenCLIPEmbedder":
-#         text_model = convert_open_clip_checkpoint(checkpoint)
-#         tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
-
-#         if stable_unclip is None:
-#             if controlnet:
-#                 pipe = StableDiffusionControlNetPipeline(
-#                     vae=vae,
-#                     text_encoder=text_model,
-#                     tokenizer=tokenizer,
-#                     unet=unet,
-#                     scheduler=scheduler,
-#                     controlnet=controlnet_model,
-#                     safety_checker=None,
-#                     feature_extractor=None,
-#                     requires_safety_checker=False,
-#                 )
-#             else:
-#                 pipe = pipeline_class(
-#                     vae=vae,
-#                     text_encoder=text_model,
-#                     tokenizer=tokenizer,
-#                     unet=unet,
-#                     scheduler=scheduler,
-#                     safety_checker=None,
-#                     feature_extractor=None,
-#                     requires_safety_checker=False,
-#                 )
-#         else:
-#             image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
-#                 original_config, clip_stats_path=clip_stats_path, device=device
-#             )
-
-#             if stable_unclip == "img2img":
-#                 feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
-
-#                 pipe = StableUnCLIPImg2ImgPipeline(
-#                     # image encoding components
-#                     feature_extractor=feature_extractor,
-#                     image_encoder=image_encoder,
-#                     # image noising components
-#                     image_normalizer=image_normalizer,
-#                     image_noising_scheduler=image_noising_scheduler,
-#                     # regular denoising components
-#                     tokenizer=tokenizer,
-#                     text_encoder=text_model,
-#                     unet=unet,
-#                     scheduler=scheduler,
-#                     # vae
-#                     vae=vae,
-#                 )
-#             elif stable_unclip == "txt2img":
-#                 if stable_unclip_prior is None or stable_unclip_prior == "karlo":
-#                     karlo_model = "kakaobrain/karlo-v1-alpha"
-#                     prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior")
-
-#                     prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-#                     prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
-
-#                     prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler")
-#                     prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
-#                 else:
-#                     raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
-
-#                 pipe = StableUnCLIPPipeline(
-#                     # prior components
-#                     prior_tokenizer=prior_tokenizer,
-#                     prior_text_encoder=prior_text_model,
-#                     prior=prior,
-#                     prior_scheduler=prior_scheduler,
-#                     # image noising components
-#                     image_normalizer=image_normalizer,
-#                     image_noising_scheduler=image_noising_scheduler,
-#                     # regular denoising components
-#                     tokenizer=tokenizer,
-#                     text_encoder=text_model,
-#                     unet=unet,
-#                     scheduler=scheduler,
-#                     # vae
-#                     vae=vae,
-#                 )
-#             else:
-#                 raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
-#     elif model_type == "PaintByExample":
-#         vision_model = convert_paint_by_example_checkpoint(checkpoint)
-#         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-#         feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
-#         pipe = PaintByExamplePipeline(
-#             vae=vae,
-#             image_encoder=vision_model,
-#             unet=unet,
-#             scheduler=scheduler,
-#             safety_checker=None,
-#             feature_extractor=feature_extractor,
-#         )
-#     elif model_type == "FrozenCLIPEmbedder":
-#         text_model = convert_ldm_clip_checkpoint(checkpoint)
-#         # tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-#         tokenizer = CLIPTokenizer.from_pretrained("/mnt/petrelfs/guoyuwei/projects/huggingface/clip-vit-large-patch14")
-
-#         # if load_safety_checker:
-#         if False:
-#             safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
-#             feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
-#         else:
-#             safety_checker = None
-#             feature_extractor = None
-
-#         if controlnet:
-#             pipe = StableDiffusionControlNetPipeline(
-#                 vae=vae,
-#                 text_encoder=text_model,
-#                 tokenizer=tokenizer,
-#                 unet=unet,
-#                 controlnet=controlnet_model,
-#                 scheduler=scheduler,
-#                 safety_checker=safety_checker,
-#                 feature_extractor=feature_extractor,
-#             )
-#         else:
-#             pipe = pipeline_class(
-#                 vae=vae,
-#                 text_encoder=text_model,
-#                 tokenizer=tokenizer,
-#                 unet=unet,
-#                 scheduler=scheduler,
-#                 safety_checker=safety_checker,
-#                 feature_extractor=feature_extractor,
-#             )
-#     else:
-#         text_config = create_ldm_bert_config(original_config)
-#         text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-#         tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-#         pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
-
-#     return pipe
-
-
-# def download_controlnet_from_original_ckpt(
-#     checkpoint_path: str,
-#     original_config_file: str,
-#     image_size: int = 512,
-#     extract_ema: bool = False,
-#     num_in_channels: Optional[int] = None,
-#     upcast_attention: Optional[bool] = None,
-#     device: str = None,
-#     from_safetensors: bool = False,
-# ) -> DiffusionPipeline:
-#     if not is_omegaconf_available():
-#         raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-#     from omegaconf import OmegaConf
-
-#     if from_safetensors:
-#         if not is_safetensors_available():
-#             raise ValueError(BACKENDS_MAPPING["safetensors"][1])
-
-#         from safetensors import safe_open
-
-#         checkpoint = {}
-#         with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
-#             for key in f.keys():
-#                 checkpoint[key] = f.get_tensor(key)
-#     else:
-#         if device is None:
-#             device = "cuda" if torch.cuda.is_available() else "cpu"
-#             checkpoint = torch.load(checkpoint_path, map_location=device)
-#         else:
-#             checkpoint = torch.load(checkpoint_path, map_location=device)
-
-#     # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
-#     # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
-#     while "state_dict" in checkpoint:
-#         checkpoint = checkpoint["state_dict"]
-
-#     original_config = OmegaConf.load(original_config_file)
-
-#     if num_in_channels is not None:
-#         original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
-
-#     if "control_stage_config" not in original_config.model.params:
-#         raise ValueError("`control_stage_config` not present in original config")
-
-#     controlnet_model = convert_controlnet_checkpoint(
-#         checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema
-#     )
-
-#     return controlnet_model
-
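The 400-odd deleted lines were a commented-out copy of the download_from_original_stable_diffusion_ckpt / download_controlnet_from_original_ckpt converters from diffusers, so no callable code is lost. Their checkpoint-loading idiom is the part worth remembering; a minimal self-contained sketch of it (the helper name is mine, the logic mirrors the deleted lines):

    import torch

    def load_checkpoint_state_dict(checkpoint_path, from_safetensors=False, device="cpu"):
        # Load a .ckpt/.safetensors file into a flat state dict, as the deleted code did.
        if from_safetensors:
            from safetensors import safe_open

            checkpoint = {}
            with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
                for key in f.keys():
                    checkpoint[key] = f.get_tensor(key)
        else:
            checkpoint = torch.load(checkpoint_path, map_location=device)
        # Some checkpoints nest the weights under one or more "state_dict" keys,
        # e.g. https://huggingface.co/thibaud/controlnet-canny-sd21
        while "state_dict" in checkpoint:
            checkpoint = checkpoint["state_dict"]
        return checkpoint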
@@ -76,14 +76,10 @@ def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX
             weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
             weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
             curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
-            # lora_dim = weight_up.shape[1]
-            # curr_layer.weight.data += (1/lora_dim) * alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
         else:
             weight_up = state_dict[pair_keys[0]].to(torch.float32)
             weight_down = state_dict[pair_keys[1]].to(torch.float32)
             curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
-            # lora_dim = weight_up.shape[1]
-            # curr_layer.weight.data += (1/lora_dim) * alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
 
         # update visited list
         for item in pair_keys:
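For context on the kept lines: convert_lora folds a LoRA pair directly into the base weight as W += alpha * (up @ down), with an extra squeeze/unsqueeze round-trip for conv layers, and the deleted comments were an abandoned variant that additionally divided by the LoRA rank. A minimal sketch of the same merge on a bare linear layer, with made-up shapes (rank 4):

    import torch
    import torch.nn as nn

    layer = nn.Linear(320, 320, bias=False)  # stand-in for an attention projection
    alpha = 0.75                             # merge strength

    # A LoRA pair factorizes the update: up is (out, rank), down is (rank, in).
    rank = 4
    weight_up = torch.randn(320, rank)
    weight_down = torch.randn(rank, 320)

    # Same update rule as the surviving diff lines, minus the conv reshaping.
    layer.weight.data += alpha * torch.mm(weight_up, weight_down)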