Merge branch master-merge-github0901 into master

Title: Merge branch 'master-github' into master-merge-github0901
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13874155
2025-12-25 20:49:37 +01:00 · 2023-09-01 11:13:19 +08:00
parent 7a1d40b54b 38ffb128c5
commit 7cd75cbe86
10 changed files with 45 additions and 37 deletions
--- a/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py
+++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py
@@ -10,7 +10,8 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.cross_attention import CrossAttention, LoRALinearLayer
+from diffusers.models.attention_processor import Attention
+from diffusers.models.lora import LoRALinearLayer
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.models.resnet import (Downsample2D, Upsample2D, downsample_2d,
                                     partial, upsample_2d)
@@ -467,7 +468,7 @@ class ControlLoRACrossAttnProcessor(LoRACrossAttnProcessor):
        return control_states

    def __call__(self,
-                 attn: CrossAttention,
+                 attn: Attention,
                 hidden_states,
                 encoder_hidden_states=None,
                 attention_mask=None,
@@ -619,7 +620,7 @@ class ControlLoRACrossAttnProcessorV2(LoRACrossAttnProcessor):
        return control_states

    def __call__(self,
-                 attn: CrossAttention,
+                 attn: Attention,
                 hidden_states,
                 encoder_hidden_states=None,
                 attention_mask=None,
--- a/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py
+++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py
@@ -11,7 +11,7 @@ import torch.nn.functional as F
 from diffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline,
                       DPMSolverMultistepScheduler, UNet2DConditionModel,
                       utils)
-from diffusers.models import cross_attention
+from diffusers.models import attention
 from diffusers.utils import deprecation_utils
 from swift import AdapterConfig, LoRAConfig, PromptConfig, Swift
 from transformers import CLIPTextModel, CLIPTokenizer
@@ -30,7 +30,7 @@ from .control_sd_lora import ControlLoRATuner

 utils.deprecate = lambda *arg, **kwargs: None
 deprecation_utils.deprecate = lambda *arg, **kwargs: None
-cross_attention.deprecate = lambda *arg, **kwargs: None
+attention.deprecate = lambda *arg, **kwargs: None

 __tuner_MAP__ = {'lora': LoRATuner, 'control_lora': ControlLoRATuner}

@@ -113,12 +113,10 @@ class EfficientStableDiffusion(TorchModel):
            rank = tuner_config[
                'rank'] if tuner_config and 'rank' in tuner_config else 4
            lora_config = LoRAConfig(
-                rank=rank,
-                replace_modules=['to_q', 'to_k', 'to_v', 'to_out.0'],
+                r=rank,
+                target_modules=['to_q', 'to_k', 'to_v', 'to_out.0'],
                merge_weights=False,
-                only_lora_trainable=False,
-                use_merged_linear=False,
-                pretrained_weights=pretrained_tuner)
+                use_merged_linear=False)
            self.unet = Swift.prepare_model(self.unet, lora_config)
        elif tuner_name == 'swift-adapter':
            adapter_length = tuner_config[
@@ -126,10 +124,8 @@ class EfficientStableDiffusion(TorchModel):
            adapter_config = AdapterConfig(
                dim=-1,
                hidden_pos=0,
-                module_name=r'.*ff\.net\.2$',
-                adapter_length=adapter_length,
-                only_adapter_trainable=False,
-                pretrained_weights=pretrained_tuner)
+                target_modules=r'.*ff\.net\.2$',
+                adapter_length=adapter_length)
            self.unet = Swift.prepare_model(self.unet, adapter_config)
        elif tuner_name == 'swift-prompt':
            prompt_length = tuner_config[
@@ -139,14 +135,11 @@ class EfficientStableDiffusion(TorchModel):
                    320, 320, 640, 640, 1280, 1280, 1280, 1280, 1280, 640, 640,
                    640, 320, 320, 320
                ],
-                module_layer_name=
+                target_modules=
                r'.*[down_blocks|up_blocks|mid_block]\.\d+\.attentions\.\d+\.transformer_blocks\.\d+$',
                embedding_pos=0,
                prompt_length=prompt_length,
-                only_prompt_trainable=False,
-                attach_front=False,
-                pretrained_weights=pretrained_tuner,
-                extract_embedding=True)
+                attach_front=False)
            self.unet = Swift.prepare_model(self.unet, prompt_config)
        elif tuner_name in ('lora', 'control_lora'):
            # if not set the config of control-tuner, we add the lora tuner directly to the original framework,
--- a/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py
+++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py
@@ -8,7 +8,8 @@ from typing import List, Tuple, Union
 import torch
 import torch.nn as nn
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.cross_attention import CrossAttention, LoRALinearLayer
+from diffusers.models.attention_processor import Attention
+from diffusers.models.lora import LoRALinearLayer
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils.outputs import BaseOutput

@@ -84,7 +85,7 @@ class LoRACrossAttnProcessor(nn.Module):
        self.output_states_skipped = is_skipped

    def __call__(self,
-                 attn: CrossAttention,
+                 attn: Attention,
                 hidden_states,
                 encoder_hidden_states=None,
                 attention_mask=None,
--- a/modelscope/models/multi_modal/video_to_video/video_to_video_model.py
+++ b/modelscope/models/multi_modal/video_to_video/video_to_video_model.py
@@ -112,7 +112,7 @@ class VideoToVideo(TorchModel):
        generator.eval()
        load_dict = torch.load(cfg.model_path, map_location='cpu')
        ret = generator.load_state_dict(load_dict['state_dict'], strict=True)
-        self.generator = generator
+        self.generator = generator.half()
        logger.info('Load model {} path {}, with local status {}'.format(
            cfg.UNet.type, cfg.model_path, ret))

@@ -175,7 +175,7 @@ class VideoToVideo(TorchModel):
        video_data = rearrange(video_data, 'b f c h w -> (b f) c h w')

        video_data_list = torch.chunk(
-            video_data, video_data.shape[0] // 2, dim=0)
+            video_data, video_data.shape[0] // 1, dim=0)
        with torch.no_grad():
            decode_data = []
            for vd_data in video_data_list:
@@ -185,6 +185,7 @@ class VideoToVideo(TorchModel):
            video_data_feature = torch.cat(decode_data, dim=0)
            video_data_feature = rearrange(
                video_data_feature, '(b f) c h w -> b c f h w', b=batch_size)
+        torch.cuda.empty_cache()

        with amp.autocast(enabled=True):
            total_noise_levels = 600
@@ -209,6 +210,7 @@ class VideoToVideo(TorchModel):
                t_min=0,
                discretization='trailing')

+            torch.cuda.empty_cache()
            scale_factor = 0.18215
            vid_tensor_feature = 1. / scale_factor * gen_vid

--- a/modelscope/pipeline_inputs.py
+++ b/modelscope/pipeline_inputs.py
@@ -240,7 +240,14 @@ TASK_INPUTS = {
    InputType.IMAGE,
    Tasks.video_embedding:
    InputType.VIDEO,
-    Tasks.virtual_try_on: (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE),
+    Tasks.virtual_try_on: [
+        (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE),
+        {
+            'masked_model': InputType.IMAGE,
+            'pose': InputType.IMAGE,
+            'cloth': InputType.IMAGE,
+        }
+    ],
    Tasks.text_driven_segmentation: {
        InputKeys.IMAGE: InputType.IMAGE,
        InputKeys.TEXT: InputType.TEXT
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -448,9 +448,9 @@ class SeqGPTPipeline(Pipeline):
    # define the forward pass
    def forward(self, prompt: str, **forward_params) -> Dict[str, Any]:
        # gen & decode
-        prompt += '[GEN]'
+        # prompt += '[GEN]'
        input_ids = self.tokenizer(
-            prompt,
+            prompt + '[GEN]',
            return_tensors='pt',
            padding=True,
            truncation=True,
--- a/tests/export/test_export_stable_diffusion.py
+++ b/tests/export/test_export_stable_diffusion.py
@@ -21,7 +21,7 @@ class TestExportStableDiffusion(unittest.TestCase):
            os.makedirs(self.tmp_dir)
        self.model_id = 'AI-ModelScope/stable-diffusion-v1-5'

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_export_stable_diffusion(self):
        model = Model.from_pretrained(self.model_id)
        Exporter.from_model(model).export_onnx(
--- a/tests/pipelines/test_efficient_diffusion_tuning_swift.py
+++ b/tests/pipelines/test_efficient_diffusion_tuning_swift.py
@@ -16,7 +16,7 @@ class EfficientDiffusionTuningTestSwift(unittest.TestCase):
    def setUp(self) -> None:
        self.task = Tasks.efficient_diffusion_tuning

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_lora_run_pipeline(self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-lora'
        model_revision = 'v1.0.2'
@@ -33,7 +33,7 @@ class EfficientDiffusionTuningTestSwift(unittest.TestCase):
            f'Efficient-diffusion-tuning-swift-lora output: {output_image_path}'
        )

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_lora_load_model_from_pretrained(
            self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-lora'
@@ -41,7 +41,7 @@ class EfficientDiffusionTuningTestSwift(unittest.TestCase):
        model = Model.from_pretrained(model_id, model_revision=model_revision)
        self.assertTrue(model.__class__ == EfficientStableDiffusion)

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_adapter_run_pipeline(self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-adapter'
        model_revision = 'v1.0.2'
@@ -58,7 +58,7 @@ class EfficientDiffusionTuningTestSwift(unittest.TestCase):
            f'Efficient-diffusion-tuning-swift-adapter output: {output_image_path}'
        )

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_adapter_load_model_from_pretrained(
            self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-adapter'
@@ -66,7 +66,7 @@ class EfficientDiffusionTuningTestSwift(unittest.TestCase):
        model = Model.from_pretrained(model_id, model_revision=model_revision)
        self.assertTrue(model.__class__ == EfficientStableDiffusion)

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_prompt_run_pipeline(self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-prompt'
        model_revision = 'v1.0.2'
@@ -83,7 +83,7 @@ class EfficientDiffusionTuningTestSwift(unittest.TestCase):
            f'Efficient-diffusion-tuning-swift-prompt output: {output_image_path}'
        )

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_prompt_load_model_from_pretrained(
            self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-prompt'
--- a/tests/pipelines/test_virtual_try_on.py
+++ b/tests/pipelines/test_virtual_try_on.py
@@ -20,7 +20,11 @@ class VirtualTryonTest(unittest.TestCase):
    masked_model = Image.open('data/test/images/virtual_tryon_model.jpg')
    pose = Image.open('data/test/images/virtual_tryon_pose.jpg')
    cloth = Image.open('data/test/images/virtual_tryon_cloth.jpg')
-    input_imgs = (masked_model, pose, cloth)
+    input_imgs = {
+        'masked_model': masked_model,
+        'pose': pose,
+        'cloth': cloth,
+    }

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
--- a/tests/trainers/test_efficient_diffusion_tuning_trainer_swift.py
+++ b/tests/trainers/test_efficient_diffusion_tuning_trainer_swift.py
@@ -33,7 +33,7 @@ class TestEfficientDiffusionTuningTrainerSwift(unittest.TestCase):
        shutil.rmtree(self.tmp_dir)
        super().tearDown()

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_lora_train(self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-lora'
        model_revision = 'v1.0.2'
@@ -62,7 +62,7 @@ class TestEfficientDiffusionTuningTrainerSwift(unittest.TestCase):
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
        self.assertIn(f'epoch_{self.max_epochs}.pth', results_files)

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_adapter_train(self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-adapter'
        model_revision = 'v1.0.2'
@@ -91,7 +91,7 @@ class TestEfficientDiffusionTuningTrainerSwift(unittest.TestCase):
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
        self.assertIn(f'epoch_{self.max_epochs}.pth', results_files)

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_efficient_diffusion_tuning_swift_prompt_train(self):
        model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-prompt'
        model_revision = 'v1.0.2'