diff --git a/animatediff/utils/convert_from_ckpt.py b/animatediff/utils/convert_from_ckpt.py
index 8973eff..9c70b92 100644
--- a/animatediff/utils/convert_from_ckpt.py
+++ b/animatediff/utils/convert_from_ckpt.py
@@ -48,16 +48,7 @@ from diffusers.schedulers import (
     PNDMScheduler,
     UnCLIPScheduler,
 )
-# from diffusers.utils import is_omegaconf_available, is_safetensors_available, logging
 from diffusers.utils.import_utils import BACKENDS_MAPPING
-# from diffusers.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
-# from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
-# from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-# from .safety_checker import StableDiffusionSafetyChecker
-# from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
-
-
-# logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
 
 def shave_segments(path, n_shave_prefix_segments=1):
@@ -724,8 +715,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 
 def convert_ldm_clip_checkpoint(checkpoint):
-    # text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
-    text_model = CLIPTextModel.from_pretrained("/mnt/petrelfs/guoyuwei/projects/huggingface/clip-vit-large-patch14")
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
 
     keys = list(checkpoint.keys())
 
     text_model_dict = {}
@@ -968,415 +958,3 @@ def convert_controlnet_checkpoint(
     controlnet_model.load_state_dict(converted_ctrl_checkpoint)
 
     return controlnet_model
-
-
-# def download_from_original_stable_diffusion_ckpt(
-#     checkpoint_path: str,
-#     original_config_file: str = None,
-#     image_size: int = 512,
-#     prediction_type: str = None,
-#     model_type: str = None,
-#     extract_ema: bool = False,
-#     scheduler_type: str = "pndm",
-#     num_in_channels: Optional[int] = None,
-#     upcast_attention: Optional[bool] = None,
-#     device: str = None,
-#     from_safetensors: bool = False,
-#     stable_unclip: Optional[str] = None,
-#     stable_unclip_prior: Optional[str] = None,
-#     clip_stats_path: Optional[str] = None,
-#     controlnet: Optional[bool] = None,
-#     load_safety_checker: bool = True,
-#     pipeline_class: DiffusionPipeline = None,
-# ) -> DiffusionPipeline:
-#     """
-#     Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
-#     config file.
-
-#     Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
-#     global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
-#     recommended that you override the default values and/or supply an `original_config_file` wherever possible.
-
-#     Args:
-#         checkpoint_path (`str`): Path to `.ckpt` file.
-#         original_config_file (`str`):
-#             Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
-#             inferred by looking for a key that only exists in SD2.0 models.
-#         image_size (`int`, *optional*, defaults to 512):
-#             The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
-#             Base. Use 768 for Stable Diffusion v2.
-#         prediction_type (`str`, *optional*):
-#             The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable
-#             Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2.
-#         num_in_channels (`int`, *optional*, defaults to None):
-#             The number of input channels. If `None`, it will be automatically inferred.
-#         scheduler_type (`str`, *optional*, defaults to 'pndm'):
-#             Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
-#             "ddim"]`.
-#         model_type (`str`, *optional*, defaults to `None`):
-#             The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder",
-#             "FrozenCLIPEmbedder", "PaintByExample"]`.
-#         is_img2img (`bool`, *optional*, defaults to `False`):
-#             Whether the model should be loaded as an img2img pipeline.
-#         extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
-#             checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
-#             `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
-#             inference. Non-EMA weights are usually better to continue fine-tuning.
-#         upcast_attention (`bool`, *optional*, defaults to `None`):
-#             Whether the attention computation should always be upcasted. This is necessary when running stable
-#             diffusion 2.1.
-#         device (`str`, *optional*, defaults to `None`):
-#             The device to use. Pass `None` to determine automatically.
-#         from_safetensors (`str`, *optional*, defaults to `False`):
-#             If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
-#         load_safety_checker (`bool`, *optional*, defaults to `True`):
-#             Whether to load the safety checker or not. Defaults to `True`.
-#         pipeline_class (`str`, *optional*, defaults to `None`):
-#             The pipeline class to use. Pass `None` to determine automatically.
-#     return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
-#     """
-
-#     # import pipelines here to avoid circular import error when using from_ckpt method
-#     from diffusers import (
-#         LDMTextToImagePipeline,
-#         PaintByExamplePipeline,
-#         StableDiffusionControlNetPipeline,
-#         StableDiffusionPipeline,
-#         StableUnCLIPImg2ImgPipeline,
-#         StableUnCLIPPipeline,
-#     )
-
-#     if pipeline_class is None:
-#         pipeline_class = StableDiffusionPipeline
-
-#     if prediction_type == "v-prediction":
-#         prediction_type = "v_prediction"
-
-#     if not is_omegaconf_available():
-#         raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-#     from omegaconf import OmegaConf
-
-#     if from_safetensors:
-#         if not is_safetensors_available():
-#             raise ValueError(BACKENDS_MAPPING["safetensors"][1])
-
-#         from safetensors import safe_open
-
-#         checkpoint = {}
-#         with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
-#             for key in f.keys():
-#                 checkpoint[key] = f.get_tensor(key)
-#     else:
-#         if device is None:
-#             device = "cuda" if torch.cuda.is_available() else "cpu"
-#             checkpoint = torch.load(checkpoint_path, map_location=device)
-#         else:
-#             checkpoint = torch.load(checkpoint_path, map_location=device)
-
-#     # Sometimes models don't have the global_step item
-#     if "global_step" in checkpoint:
-#         global_step = checkpoint["global_step"]
-#     else:
-#         print("global_step key not found in model")
-#         global_step = None
-
-#     # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
-#     # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
-#     while "state_dict" in checkpoint:
-#         checkpoint = checkpoint["state_dict"]
-
-#     if original_config_file is None:
-#         key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
-
-#         # model_type = "v1"
-#         config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
-
-#         if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
-#             # model_type = "v2"
-#             config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
-
-#             if global_step == 110000:
-#                 # v2.1 needs to upcast attention
-#                 upcast_attention = True
-
-#         original_config_file = BytesIO(requests.get(config_url).content)
-
-#     original_config = OmegaConf.load(original_config_file)
-
-#     if num_in_channels is not None:
-#         original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
-
-#     if (
-#         "parameterization" in original_config["model"]["params"]
-#         and original_config["model"]["params"]["parameterization"] == "v"
-#     ):
-#         if prediction_type is None:
-#             # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
-#             # as it relies on a brittle global step parameter here
-#             prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
-#         if image_size is None:
-#             # NOTE: For stable diffusion 2 base one has to pass `image_size==512`
-#             # as it relies on a brittle global step parameter here
-#             image_size = 512 if global_step == 875000 else 768
-#     else:
-#         if prediction_type is None:
-#             prediction_type = "epsilon"
-#         if image_size is None:
-#             image_size = 512
-
-#     if controlnet is None:
-#         controlnet = "control_stage_config" in original_config.model.params
-
-#     if controlnet:
-#         controlnet_model = convert_controlnet_checkpoint(
-#             checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema
-#         )
-
-#     num_train_timesteps = original_config.model.params.timesteps
-#     beta_start = original_config.model.params.linear_start
-#     beta_end = original_config.model.params.linear_end
-
-#     scheduler = DDIMScheduler(
-#         beta_end=beta_end,
-#         beta_schedule="scaled_linear",
-#         beta_start=beta_start,
-#         num_train_timesteps=num_train_timesteps,
-#         steps_offset=1,
-#         clip_sample=False,
-#         set_alpha_to_one=False,
-#         prediction_type=prediction_type,
-#     )
-#     # make sure scheduler works correctly with DDIM
-#     scheduler.register_to_config(clip_sample=False)
-
-#     if scheduler_type == "pndm":
-#         config = dict(scheduler.config)
-#         config["skip_prk_steps"] = True
-#         scheduler = PNDMScheduler.from_config(config)
-#     elif scheduler_type == "lms":
-#         scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "heun":
-#         scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "euler":
-#         scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "euler-ancestral":
-#         scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "dpm":
-#         scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
-#     elif scheduler_type == "ddim":
-#         scheduler = scheduler
-#     else:
-#         raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
-
-#     # Convert the UNet2DConditionModel model.
-#     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
-#     unet_config["upcast_attention"] = upcast_attention
-#     unet = UNet2DConditionModel(**unet_config)
-
-#     converted_unet_checkpoint = convert_ldm_unet_checkpoint(
-#         checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
-#     )
-
-#     unet.load_state_dict(converted_unet_checkpoint)
-
-#     # Convert the VAE model.
-#     vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-#     converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-#     vae = AutoencoderKL(**vae_config)
-#     vae.load_state_dict(converted_vae_checkpoint)
-
-#     # Convert the text model.
-#     if model_type is None:
-#         model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
-#         logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}")
-
-#     if model_type == "FrozenOpenCLIPEmbedder":
-#         text_model = convert_open_clip_checkpoint(checkpoint)
-#         tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
-
-#         if stable_unclip is None:
-#             if controlnet:
-#                 pipe = StableDiffusionControlNetPipeline(
-#                     vae=vae,
-#                     text_encoder=text_model,
-#                     tokenizer=tokenizer,
-#                     unet=unet,
-#                     scheduler=scheduler,
-#                     controlnet=controlnet_model,
-#                     safety_checker=None,
-#                     feature_extractor=None,
-#                     requires_safety_checker=False,
-#                 )
-#             else:
-#                 pipe = pipeline_class(
-#                     vae=vae,
-#                     text_encoder=text_model,
-#                     tokenizer=tokenizer,
-#                     unet=unet,
-#                     scheduler=scheduler,
-#                     safety_checker=None,
-#                     feature_extractor=None,
-#                     requires_safety_checker=False,
-#                 )
-#         else:
-#             image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
-#                 original_config, clip_stats_path=clip_stats_path, device=device
-#             )
-
-#             if stable_unclip == "img2img":
-#                 feature_extractor, image_encoder = stable_unclip_image_encoder(original_config)
-
-#                 pipe = StableUnCLIPImg2ImgPipeline(
-#                     # image encoding components
-#                     feature_extractor=feature_extractor,
-#                     image_encoder=image_encoder,
-#                     # image noising components
-#                     image_normalizer=image_normalizer,
-#                     image_noising_scheduler=image_noising_scheduler,
-#                     # regular denoising components
-#                     tokenizer=tokenizer,
-#                     text_encoder=text_model,
-#                     unet=unet,
-#                     scheduler=scheduler,
-#                     # vae
-#                     vae=vae,
-#                 )
-#             elif stable_unclip == "txt2img":
-#                 if stable_unclip_prior is None or stable_unclip_prior == "karlo":
-#                     karlo_model = "kakaobrain/karlo-v1-alpha"
-#                     prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior")
-
-#                     prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-#                     prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
-
-#                     prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler")
-#                     prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
-#                 else:
-#                     raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
-
-#                 pipe = StableUnCLIPPipeline(
-#                     # prior components
-#                     prior_tokenizer=prior_tokenizer,
-#                     prior_text_encoder=prior_text_model,
-#                     prior=prior,
-#                     prior_scheduler=prior_scheduler,
-#                     # image noising components
-#                     image_normalizer=image_normalizer,
-#                     image_noising_scheduler=image_noising_scheduler,
-#                     # regular denoising components
-#                     tokenizer=tokenizer,
-#                     text_encoder=text_model,
-#                     unet=unet,
-#                     scheduler=scheduler,
-#                     # vae
-#                     vae=vae,
-#                 )
-#             else:
-#                 raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
-#     elif model_type == "PaintByExample":
-#         vision_model = convert_paint_by_example_checkpoint(checkpoint)
-#         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-#         feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
-#         pipe = PaintByExamplePipeline(
-#             vae=vae,
-#             image_encoder=vision_model,
-#             unet=unet,
-#             scheduler=scheduler,
-#             safety_checker=None,
-#             feature_extractor=feature_extractor,
-#         )
-#     elif model_type == "FrozenCLIPEmbedder":
-#         text_model = convert_ldm_clip_checkpoint(checkpoint)
-#         # tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-#         tokenizer = CLIPTokenizer.from_pretrained("/mnt/petrelfs/guoyuwei/projects/huggingface/clip-vit-large-patch14")
-
-#         # if load_safety_checker:
-#         if False:
-#             safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
-#             feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
-#         else:
-#             safety_checker = None
-#             feature_extractor = None
-
-#         if controlnet:
-#             pipe = StableDiffusionControlNetPipeline(
-#                 vae=vae,
-#                 text_encoder=text_model,
-#                 tokenizer=tokenizer,
-#                 unet=unet,
-#                 controlnet=controlnet_model,
-#                 scheduler=scheduler,
-#                 safety_checker=safety_checker,
-#                 feature_extractor=feature_extractor,
-#             )
-#         else:
-#             pipe = pipeline_class(
-#                 vae=vae,
-#                 text_encoder=text_model,
-#                 tokenizer=tokenizer,
-#                 unet=unet,
-#                 scheduler=scheduler,
-#                 safety_checker=safety_checker,
-#                 feature_extractor=feature_extractor,
-#             )
-#     else:
-#         text_config = create_ldm_bert_config(original_config)
-#         text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-#         tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-#         pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
-
-#     return pipe
-
-
-# def download_controlnet_from_original_ckpt(
-#     checkpoint_path: str,
-#     original_config_file: str,
-#     image_size: int = 512,
-#     extract_ema: bool = False,
-#     num_in_channels: Optional[int] = None,
-#     upcast_attention: Optional[bool] = None,
-#     device: str = None,
-#     from_safetensors: bool = False,
-# ) -> DiffusionPipeline:
-#     if not is_omegaconf_available():
-#         raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-#     from omegaconf import OmegaConf
-
-#     if from_safetensors:
-#         if not is_safetensors_available():
-#             raise ValueError(BACKENDS_MAPPING["safetensors"][1])
-
-#         from safetensors import safe_open
-
-#         checkpoint = {}
-#         with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
-#             for key in f.keys():
-#                 checkpoint[key] = f.get_tensor(key)
-#     else:
-#         if device is None:
-#             device = "cuda" if torch.cuda.is_available() else "cpu"
-#             checkpoint = torch.load(checkpoint_path, map_location=device)
-#         else:
-#             checkpoint = torch.load(checkpoint_path, map_location=device)
-
-#     # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
-#     # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21
-#     while "state_dict" in checkpoint:
-#         checkpoint = checkpoint["state_dict"]
-
-#     original_config = OmegaConf.load(original_config_file)
-
-#     if num_in_channels is not None:
-#         original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
-
-#     if "control_stage_config" not in original_config.model.params:
-#         raise ValueError("`control_stage_config` not present in original config")
-
-#     controlnet_model = convert_controlnet_checkpoint(
-#         checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema
-#     )
-
-#     return controlnet_model
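A note on the convert_from_ckpt.py change above: the patch swaps a hard-coded local clone path back to the public openai/clip-vit-large-patch14 Hub id inside convert_ldm_clip_checkpoint. If offline use still matters, the encoder source can be kept overridable rather than hard-coded; the sketch below illustrates one way to do that. The load_clip_text_model helper and the CLIP_MODEL_PATH environment variable are assumptions for illustration only, not part of this repository.

import os
from typing import Optional

from transformers import CLIPTextModel

# Default restored by this diff; can be overridden per call or per environment.
DEFAULT_CLIP_ID = "openai/clip-vit-large-patch14"


def load_clip_text_model(model_id: Optional[str] = None) -> CLIPTextModel:
    # Resolution order: explicit argument, then the (hypothetical) CLIP_MODEL_PATH
    # environment variable, then the public Hub id.
    source = model_id or os.environ.get("CLIP_MODEL_PATH", DEFAULT_CLIP_ID)
    return CLIPTextModel.from_pretrained(source)


# Usage: text_model = load_clip_text_model()  # falls back to the Hub id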
diff --git a/animatediff/utils/convert_lora_safetensor_to_diffusers.py b/animatediff/utils/convert_lora_safetensor_to_diffusers.py
index 6b8c7c9..0a7a429 100644
--- a/animatediff/utils/convert_lora_safetensor_to_diffusers.py
+++ b/animatediff/utils/convert_lora_safetensor_to_diffusers.py
@@ -76,14 +76,10 @@ def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX
             weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
             weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
             curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
-            # lora_dim = weight_up.shape[1]
-            # curr_layer.weight.data += (1/lora_dim) * alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
         else:
             weight_up = state_dict[pair_keys[0]].to(torch.float32)
             weight_down = state_dict[pair_keys[1]].to(torch.float32)
             curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
-            # lora_dim = weight_up.shape[1]
-            # curr_layer.weight.data += (1/lora_dim) * alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
 
         # update visited list
         for item in pair_keys:
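The convert_lora_safetensor_to_diffusers.py hunk drops the commented-out 1/lora_dim scaling variant and keeps the plain merge W += alpha * (up @ down). For reference, a self-contained sketch of that merge for a linear layer is below; the function name, shapes, and default alpha are illustrative assumptions rather than code from this repository (for conv layers the kept lines additionally squeeze/unsqueeze the trailing spatial dimensions).

import torch


def merge_lora_into_linear(weight: torch.Tensor,
                           lora_up: torch.Tensor,
                           lora_down: torch.Tensor,
                           alpha: float = 0.75) -> torch.Tensor:
    # weight:    (out_features, in_features)
    # lora_up:   (out_features, rank)
    # lora_down: (rank, in_features)
    update = alpha * torch.mm(lora_up.to(torch.float32), lora_down.to(torch.float32))
    return weight + update.to(weight.dtype).to(weight.device)


# Example: a rank-4 LoRA update applied to a 320x320 projection weight.
w = torch.zeros(320, 320)
up, down = torch.randn(320, 4), torch.randn(4, 320)
merged = merge_lora_into_linear(w, up, down, alpha=0.75)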