From 54135311f96be0e8ab00e98e7756e86002bc9673 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 30 Aug 2023 14:58:32 +0800 Subject: [PATCH 01/18] add python311 support for whl --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 98b12888..dbac6e77 100644 --- a/setup.py +++ b/setup.py @@ -219,6 +219,7 @@ if __name__ == '__main__': 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], license='Apache License 2.0', tests_require=parse_requirements('requirements/tests.txt'), From 690473ce85097dda677f3d0a4c2ce2c7e6d4383b Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Fri, 20 Oct 2023 16:10:54 +0800 Subject: [PATCH 02/18] add freeU model Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14307648 * support sd21, sdxl --- modelscope/metainfo.py | 1 + .../models/multi_modal/freeu/__init__.py | 22 ++ .../multi_modal/freeu/free_lunch_utils.py | 331 ++++++++++++++++++ modelscope/pipelines/multi_modal/__init__.py | 4 +- .../text_to_image_freeu_pipeline.py | 138 ++++++++ tests/pipelines/test_text_to_image_freeu.py | 57 +++ 6 files changed, 552 insertions(+), 1 deletion(-) create mode 100644 modelscope/models/multi_modal/freeu/__init__.py create mode 100644 modelscope/models/multi_modal/freeu/free_lunch_utils.py create mode 100644 modelscope/pipelines/multi_modal/text_to_image_freeu_pipeline.py create mode 100644 tests/pipelines/test_text_to_image_freeu.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index ea56efb5..377ade9b 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -291,6 +291,7 @@ class Pipelines(object): image_denoise = 'nafnet-image-denoise' image_deblur = 'nafnet-image-deblur' image_editing = 'masactrl-image-editing' + freeu_stable_diffusion_text2image = 'freeu-stable-diffusion-text2image' person_image_cartoon = 'unet-person-image-cartoon' ocr_detection = 'resnet18-ocr-detection' table_recognition = 'dla34-table-recognition' diff --git a/modelscope/models/multi_modal/freeu/__init__.py b/modelscope/models/multi_modal/freeu/__init__.py new file mode 100644 index 00000000..3cd55cf3 --- /dev/null +++ b/modelscope/models/multi_modal/freeu/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .free_lunch_utils import register_free_upblock2d, register_free_crossattn_upblock2d +else: + _import_structure = { + 'free_lunch_utils': + ['register_free_upblock2d', 'register_free_crossattn_upblock2d'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/multi_modal/freeu/free_lunch_utils.py b/modelscope/models/multi_modal/freeu/free_lunch_utils.py new file mode 100644 index 00000000..eb5d191f --- /dev/null +++ b/modelscope/models/multi_modal/freeu/free_lunch_utils.py @@ -0,0 +1,331 @@ +# ------------------------------------------------------------------------ +# Modified from https://github.com/ChenyangSi/FreeU/blob/main/demo/free_lunch_utils.py +# Copyright (c) 2023 TencentARC. All Rights Reserved. 
+# ------------------------------------------------------------------------ + +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.fft as fft +from diffusers.utils import is_torch_version + + +def isinstance_str(x: object, cls_name: str): + """ + Checks whether x has any class *named* cls_name in its ancestry. + Doesn't require access to the class's implementation. + + Useful for patching! + """ + + for _cls in x.__class__.__mro__: + if _cls.__name__ == cls_name: + return True + + return False + + +def Fourier_filter(x, threshold, scale): + dtype = x.dtype + x = x.type(torch.float32) + # FFT + x_freq = fft.fftn(x, dim=(-2, -1)) + x_freq = fft.fftshift(x_freq, dim=(-2, -1)) + + B, C, H, W = x_freq.shape + mask = torch.ones((B, C, H, W)).cuda() + + crow, ccol = H // 2, W // 2 + mask[..., crow - threshold:crow + threshold, + ccol - threshold:ccol + threshold] = scale + x_freq = x_freq * mask + + # IFFT + x_freq = fft.ifftshift(x_freq, dim=(-2, -1)) + x_filtered = fft.ifftn(x_freq, dim=(-2, -1)).real + + x_filtered = x_filtered.type(dtype) + return x_filtered + + +def register_upblock2d(model): + + def up_forward(self): + + def forward(hidden_states, + res_hidden_states_tuple, + temb=None, + upsample_size=None): + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], + dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version('>=', '1.11.0'): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb) + else: + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + return forward + + for i, upsample_block in enumerate(model.unet.up_blocks): + if isinstance_str(upsample_block, 'UpBlock2D'): + upsample_block.forward = up_forward(upsample_block) + + +def register_free_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2): + + def up_forward(self): + + def forward(hidden_states, + res_hidden_states_tuple, + temb=None, + upsample_size=None): + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # --------------- FreeU code ----------------------- + # Only operate on the first two stages + if hidden_states.shape[1] == 1280: + hidden_states[:, :640] = hidden_states[:, :640] * self.b1 + res_hidden_states = Fourier_filter( + res_hidden_states, threshold=1, scale=self.s1) + if hidden_states.shape[1] == 640: + hidden_states[:, :320] = hidden_states[:, :320] * self.b2 + res_hidden_states = Fourier_filter( + res_hidden_states, threshold=1, scale=self.s2) + # --------------------------------------------------------- + + hidden_states = torch.cat([hidden_states, res_hidden_states], + dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version('>=', '1.11.0'): + hidden_states 
= torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb) + else: + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + return forward + + for i, upsample_block in enumerate(model.unet.up_blocks): + if isinstance_str(upsample_block, 'UpBlock2D'): + upsample_block.forward = up_forward(upsample_block) + setattr(upsample_block, 'b1', b1) + setattr(upsample_block, 'b2', b2) + setattr(upsample_block, 's1', s1) + setattr(upsample_block, 's2', s2) + + +def register_crossattn_upblock2d(model): + + def up_forward(self): + + def forward( + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ): + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], + dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = { + 'use_reentrant': False + } if is_torch_version('>=', '1.11.0') else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + **ckpt_kwargs, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + return forward + + for i, upsample_block in enumerate(model.unet.up_blocks): + if isinstance_str(upsample_block, 'CrossAttnUpBlock2D'): + upsample_block.forward = up_forward(upsample_block) + + +def register_free_crossattn_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2): + + def up_forward(self): + + def forward( + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ): + for resnet, 
attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # --------------- FreeU code ----------------------- + # Only operate on the first two stages + if hidden_states.shape[1] == 1280: + hidden_states[:, :640] = hidden_states[:, :640] * self.b1 + res_hidden_states = Fourier_filter( + res_hidden_states, threshold=1, scale=self.s1) + if hidden_states.shape[1] == 640: + hidden_states[:, :320] = hidden_states[:, :320] * self.b2 + res_hidden_states = Fourier_filter( + res_hidden_states, threshold=1, scale=self.s2) + # --------------------------------------------------------- + + hidden_states = torch.cat([hidden_states, res_hidden_states], + dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = { + 'use_reentrant': False + } if is_torch_version('>=', '1.11.0') else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + **ckpt_kwargs, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + return forward + + for i, upsample_block in enumerate(model.unet.up_blocks): + if isinstance_str(upsample_block, 'CrossAttnUpBlock2D'): + upsample_block.forward = up_forward(upsample_block) + setattr(upsample_block, 'b1', b1) + setattr(upsample_block, 'b2', b2) + setattr(upsample_block, 's1', s1) + setattr(upsample_block, 's2', s2) diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py index b5316684..1faa261e 100644 --- a/modelscope/pipelines/multi_modal/__init__.py +++ b/modelscope/pipelines/multi_modal/__init__.py @@ -26,6 +26,7 @@ if TYPE_CHECKING: from .visual_question_answering_pipeline import VisualQuestionAnsweringPipeline from .video_question_answering_pipeline import VideoQuestionAnsweringPipeline from .videocomposer_pipeline import VideoComposerPipeline + from .text_to_image_freeu_pipeline import FreeUTextToImagePipeline else: _import_structure = { 'image_captioning_pipeline': ['ImageCaptioningPipeline'], @@ -53,7 +54,8 @@ else: ['SOONetVideoTemporalGroundingPipeline'], 'text_to_video_synthesis_pipeline': ['TextToVideoSynthesisPipeline'], 'multimodal_dialogue_pipeline': ['MultimodalDialoguePipeline'], - 'videocomposer_pipeline': ['VideoComposerPipeline'] + 'videocomposer_pipeline': ['VideoComposerPipeline'], + 'text_to_image_freeu_pipeline': ['FreeUTextToImagePipeline'] } import sys diff --git a/modelscope/pipelines/multi_modal/text_to_image_freeu_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_freeu_pipeline.py new file mode 100644 index 00000000..9300554c --- /dev/null +++ 
b/modelscope/pipelines/multi_modal/text_to_image_freeu_pipeline.py @@ -0,0 +1,138 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path +from typing import Any, Dict, Optional, Union + +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal.freeu import ( + register_free_crossattn_upblock2d, register_free_upblock2d) +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['FreeUTextToImagePipeline'] + + +@PIPELINES.register_module( + Tasks.text_to_image_synthesis, + module_name=Pipelines.freeu_stable_diffusion_text2image) +class FreeUTextToImagePipeline(Pipeline): + + def __init__(self, model=str, preprocessor=None, **kwargs): + """ FreeU Text to Image Pipeline. + + Examples: + + >>> import cv2 + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + + >>> prompt = "a photo of a running corgi" # prompt + >>> output_image_path = './result.png' + >>> inputs = {'prompt': prompt} + >>> + >>> pipe = pipeline( + >>> Tasks.text_to_image_synthesis, + >>> model='damo/multi-modal_freeu_stable_diffusion', + >>> base_model='AI-ModelScope/stable-diffusion-v1-5', + >>> ) + >>> + >>> output = pipe(inputs)['output_imgs'] + >>> cv2.imwrite(output_image_path, output) + >>> print('pipeline: the output image path is {}'.format(output_image_path)) + """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + torch_dtype = kwargs.get('torch_dtype', torch.float32) + self._device = getattr( + kwargs, 'device', + torch.device('cuda' if torch.cuda.is_available() else 'cpu')) + base_model = kwargs.get( + 'base_model', 'AI-ModelScope/stable-diffusion-v1-5') # default 1.5 + self.freeu_params = kwargs.get('freeu_params', { + 'b1': 1.5, + 'b2': 1.6, + 's1': 0.9, + 's2': 0.2 + }) # default + + logger.info('load freeu stable diffusion text to image pipeline done') + self.pipeline = pipeline( + task=Tasks.text_to_image_synthesis, + model=base_model, + torch_dtype=torch_dtype, + device=self._device).pipeline + + def preprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return inputs + + def forward(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + """ + Inputs Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + """ + if not isinstance(inputs, dict): + raise ValueError( + f'Expected the input to be a dictionary, but got {type(inputs)}' + ) + # -------- freeu block registration + register_free_upblock2d(self.pipeline, **self.freeu_params) + register_free_crossattn_upblock2d(self.pipeline, **self.freeu_params) + # -------- freeu block registration + + output = self.pipeline( + prompt=inputs.get('prompt'), + height=inputs.get('height'), + width=inputs.get('width'), + num_inference_steps=inputs.get('num_inference_steps', 50), + guidance_scale=inputs.get('guidance_scale', 7.5), + negative_prompt=inputs.get('negative_prompt'), + num_images_per_prompt=inputs.get('num_images_per_prompt', 1), + eta=inputs.get('eta', 0.0), + generator=inputs.get('generator'), + latents=inputs.get('latents'), + ).images[0] + + return {'output_tensor': output} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + output_img = np.array(inputs['output_tensor']) + return {OutputKeys.OUTPUT_IMGS: output_img[:, :, ::-1]} diff --git a/tests/pipelines/test_text_to_image_freeu.py b/tests/pipelines/test_text_to_image_freeu.py new file mode 100644 index 00000000..7aebe318 --- /dev/null +++ b/tests/pipelines/test_text_to_image_freeu.py @@ -0,0 +1,57 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
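+# FreeU (see free_lunch_utils.py earlier in this patch) is a training-free tweak that
+# scales UNet backbone features (b1/b2) and Fourier-filters skip-connection features
+# (s1/s2) at inference time; the tests below run the patched text-to-image pipeline end to end.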
+ +import unittest + +import cv2 + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.multi_modal import FreeUTextToImagePipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class ImageEditingTest(unittest.TestCase): + + def setUp(self) -> None: + self.task = Tasks.text_to_image_synthesis + self.model_id = 'damo/multi-modal_freeu_stable_diffusion' + prompt = 'a photo of a running corgi' # prompt + self.inputs = {'prompt': prompt} + self.output_image_path = './result.png' + self.base_model = 'AI-ModelScope/stable-diffusion-v2-1' + self.freeu_params = { + 'b1': 1.4, + 'b2': 1.6, + 's1': 0.9, + 's2': 0.2 + } # for SD2.1 + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + pipeline = FreeUTextToImagePipeline(cache_path) + pipeline.group_key = self.task + synthesized_img = pipeline( + input=self.inputs)[OutputKeys.OUTPUT_IMGS] # BGR + cv2.imwrite(self.output_image_path, synthesized_img) + print('FreeU pipeline: the synthesized image path is {}'.format( + self.output_image_path)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.text_to_image_synthesis, + model=self.model_id, + base_model=self.base_model, + freeu_params=self.freeu_params) + synthesized_img = pipeline_ins( + self.inputs)[OutputKeys.OUTPUT_IMGS] # BGR + cv2.imwrite(self.output_image_path, synthesized_img) + print('FreeU pipeline: the synthesized image path is {}'.format( + self.output_image_path)) + + +if __name__ == '__main__': + unittest.main() From fb7328f4ec34cf5f4c6478f11846b740f24e1e1d Mon Sep 17 00:00:00 2001 From: "zhangyanzhao.zyz" Date: Fri, 20 Oct 2023 19:56:01 +0800 Subject: [PATCH 03/18] =?UTF-8?q?=E6=9B=B4=E6=96=B0sentence=20embedding=20?= =?UTF-8?q?model=EF=BC=8C=E6=94=AF=E6=8C=81gte=EF=BC=8Cbloom=20sentence=20?= =?UTF-8?q?embedding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14375781 * fix linter * bloom embedding --- .../models/nlp/bert/sentence_embedding.py | 15 +- modelscope/models/nlp/bloom/__init__.py | 2 + .../models/nlp/bloom/sentence_embedding.py | 165 ++++++++++++++++++ .../nlp/sentence_embedding_preprocessor.py | 103 ++++++++++- tests/pipelines/test_sentence_embedding.py | 9 + 5 files changed, 286 insertions(+), 8 deletions(-) create mode 100644 modelscope/models/nlp/bloom/sentence_embedding.py diff --git a/modelscope/models/nlp/bert/sentence_embedding.py b/modelscope/models/nlp/bert/sentence_embedding.py index 92a9da50..b7df5ef9 100644 --- a/modelscope/models/nlp/bert/sentence_embedding.py +++ b/modelscope/models/nlp/bert/sentence_embedding.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import torch +import torch.nn.functional as F from torch import nn from modelscope.metainfo import Models @@ -61,8 +62,9 @@ class BertForSentenceEmbedding(BertPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config) self.config = config - self.pooler_type = kwargs.get('pooler_type', 'cls') + self.pooler_type = kwargs.get('emb_pooler_type', 'cls') self.pooler = Pooler(self.pooler_type) + self.normalize = kwargs.get('normalize', False) setattr(self, self.base_model_prefix, BertModel(config, add_pooling_layer=False)) @@ -128,6 +130,8 @@ class BertForSentenceEmbedding(BertPreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict) outputs = self.pooler(outputs, attention_mask) + if self.normalize: + outputs = F.normalize(outputs, p=2, dim=-1) return outputs @classmethod @@ -142,8 +146,11 @@ class BertForSentenceEmbedding(BertPreTrainedModel): The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ model_dir = kwargs.get('model_dir') - model = super( - Model, - cls).from_pretrained(pretrained_model_name_or_path=model_dir) + model_kwargs = { + 'emb_pooler_type': kwargs.get('emb_pooler_type', 'cls'), + 'normalize': kwargs.get('normalize', False) + } + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) model.model_dir = model_dir return model diff --git a/modelscope/models/nlp/bloom/__init__.py b/modelscope/models/nlp/bloom/__init__.py index b0f04af7..24d7202d 100644 --- a/modelscope/models/nlp/bloom/__init__.py +++ b/modelscope/models/nlp/bloom/__init__.py @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .backbone import BloomModel from .text_generation import BloomForTextGeneration + from .sentence_embedding import BloomForSentenceEmbedding else: _import_structure = { 'backbone': ['BloomModel'], 'text_generation': ['BloomForTextGeneration'], + 'sentence_embedding': ['BloomForSentenceEmbedding'] } import sys sys.modules[__name__] = LazyImportModule( diff --git a/modelscope/models/nlp/bloom/sentence_embedding.py b/modelscope/models/nlp/bloom/sentence_embedding.py new file mode 100644 index 00000000..ec35db38 --- /dev/null +++ b/modelscope/models/nlp/bloom/sentence_embedding.py @@ -0,0 +1,165 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +from transformers import BloomConfig +from transformers import BloomModel as BloomModelTransform + +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.outputs import SentencEmbeddingModelOutput +from modelscope.utils.constant import Tasks + + +class DecoderPooler(torch.nn.Module): + """ + Parameter-free poolers to get the sentence embedding + 'last': the last token state. + 'weighted_mean': position weighted average of all token states. + """ + + def __init__(self, pooler_type): + super().__init__() + self.pooler_type = pooler_type + assert self.pooler_type in [ + 'last', 'weighted_mean' + ], 'unrecognized pooling type %s' % self.pooler_type + + def forward(self, outputs, attention_mask): + last_hidden = outputs.last_hidden_state + + if self.pooler_type in ['last']: + n, l, h = last_hidden.shape + + # Get shape [n] indices of the last token (i.e. 
the last token for each batch item) + # Any sequence where min == 1, we use the entire sequence lenth since argmin = 0 + values, indices = torch.min(attention_mask, 1, keepdim=False) + gather_indices = torch.where(values == 0, indices, + l) - 1 # Shape [n] + + # There are empty sequences, where the index would become -1 which will crash + gather_indices = torch.clamp(gather_indices, min=0) + + # Turn indices from shape [n] --> [n, 1, h] + gather_indices = gather_indices.unsqueeze(1).unsqueeze(1).expand( + n, 1, h) + + # Gather along the 1st dim (l) (n, l, h -> n, h) + pooled_output = torch.gather(last_hidden, 1, + gather_indices).squeeze(dim=1) + + elif self.pooler_type == 'weighted_mean': + input_mask_expanded = attention_mask.unsqueeze(-1).expand( + last_hidden.size()).float() + # last_hidden shape: bs, seq, hidden_dim + weights = ( + torch.arange(start=1, end=last_hidden.shape[1] + + 1).unsqueeze(0).unsqueeze(-1).expand( + last_hidden.size()).float().to( + last_hidden.device)) + assert weights.shape == last_hidden.shape == input_mask_expanded.shape + input_mask_expanded = input_mask_expanded * weights + + sum_embeddings = torch.sum(last_hidden * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + sum_mask = torch.clamp(sum_mask, min=1e-9) + pooled_output = sum_embeddings / sum_mask + + else: + raise NotImplementedError + + return pooled_output + + +@MODELS.register_module( + group_key=Tasks.sentence_embedding, module_name=Models.bloom) +class BloomForSentenceEmbedding(BloomModelTransform, TorchModel): + r""" + This model represent a text to a dense vector by the last token state or weighted mean of all token states. + See `Language Models are Universal Embedders + `_ for details. + """ + + def __init__(self, config, **kwargs): + super().__init__(config) + self.config = config + self.pooler_type = kwargs.get('emb_pooler_type', 'weighted_mean') + self.pooler = DecoderPooler(self.pooler_type) + self.normalize = kwargs.get('normalize', False) + setattr(self, self.base_model_prefix, BloomModelTransform(config)) + + def forward(self, query=None, docs=None, labels=None): + r""" + Args: + query (:obj: `dict`): Dict of pretrained models's input for the query sequence. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + docs (:obj: `dict`): Dict of pretrained models's input for the query sequence. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. 
+ Returns: + Returns `modelscope.outputs.SentencEmbeddingModelOutput + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_udever_bloom_560m') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_udever_bloom_560m') + >>> inputs = preprocessor({'source_sentence': ['This is a test']}) + >>> outputs = model(**inputs) + >>> print(outputs) + """ + query_embeddings, doc_embeddings = None, None + if query is not None: + query_embeddings = self.encode(**query) + if docs is not None: + doc_embeddings = self.encode(**docs) + outputs = SentencEmbeddingModelOutput( + query_embeddings=query_embeddings, doc_embeddings=doc_embeddings) + if query_embeddings is None or doc_embeddings is None: + return outputs + if self.base_model.training: + loss_fct = torch.nn.CrossEntropyLoss() + scores = torch.matmul(query_embeddings, doc_embeddings.T) + if labels is None: + labels = torch.arange( + scores.size(0), device=scores.device, dtype=torch.long) + labels = labels * ( + doc_embeddings.size(0) // query_embeddings.size(0)) + loss = loss_fct(scores, labels) + outputs.loss = loss + return outputs + + def encode( + self, + input_ids=None, + attention_mask=None, + ): + outputs = self.base_model.forward( + input_ids, attention_mask=attention_mask) + embeddings = self.pooler(outputs, attention_mask) + if self.normalize: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1) + return embeddings + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_dir = kwargs.get('model_dir') + model_kwargs = { + 'emb_pooler_type': kwargs.get('emb_pooler_type', 'weighted_mean'), + 'normalize': kwargs.get('normalize', False) + } + if model_dir is None: + config = BloomConfig(**kwargs) + model = cls(config) + else: + model = super(BloomModelTransform, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + model.model_dir = model_dir + return model diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py index b03268c6..f1ca6685 100644 --- a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -1,14 +1,19 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict +from typing import Any, Dict, Optional + +import torch from modelscope.metainfo import Preprocessors from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.hub import get_model_type +from modelscope.utils.logger import get_logger from .transformers_tokenizer import NLPTokenizer +logger = get_logger() + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sentence_embedding) @@ -46,9 +51,32 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): self.max_length = max_length if model_dir is not None: model_type = get_model_type(model_dir) + # we could add `boq/bod` token/prompt and `eoq/eod` token if they exist when tokenizing. 
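+        # boq/eoq and bod/eod are (presumably) begin/end-of-query and begin/end-of-document
+        # markers used by decoder-style embedders such as udever-bloom; each defaults to None
+        # when the model config does not provide it.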
+ for k in ('boq', 'eoq', 'bod', 'eod'): + setattr(self, k, kwargs.pop(k, None)) self.nlp_tokenizer = NLPTokenizer( model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) super().__init__(mode=mode) + tokenizer = self.nlp_tokenizer.tokenizer + # For tokenizers like bloom + if tokenizer.padding_side != 'right': + # weighted mean pooling need pad right + logger.warning( + f'Change tokenizer.padding_side from {tokenizer.padding_side} to right' + ) + tokenizer.padding_side = 'right' + # For decoder-only tokenizers + if tokenizer.pad_token is None: + logger.warning( + f'Set tokenizer.pad_token as eos_token {tokenizer.eos_token}') + tokenizer.pad_token = tokenizer.eos_token + # Currently eos is single token, we can extend to prompt later. + for k in ('eoq', 'eod'): + v = getattr(self, k, None) + if v is not None: + v = tokenizer.convert_tokens_to_ids(v) + setattr(self, k + '_id', v) + self.pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token) def __call__(self, data: Dict, @@ -81,13 +109,80 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): if 'return_tensors' not in kwargs: kwargs[ 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None - query_inputs = self.nlp_tokenizer( - source_sentences, padding=padding, truncation=truncation, **kwargs) + query_inputs = self.tokenize( + source_sentences, + is_query=True, + padding=padding, + truncation=truncation, + **kwargs) tokenized_inputs = {'query': query_inputs, 'docs': None} if compare_sentences is not None and len(compare_sentences) > 0: - tokenized_inputs['docs'] = self.nlp_tokenizer( + tokenized_inputs['docs'] = self.tokenize( compare_sentences, + is_query=kwargs.get('symmetric', False), padding=padding, truncation=truncation, **kwargs) return tokenized_inputs + + def tokenize(self, texts, is_query=True, return_tensors=None, **kwargs): + """Tokenize raw texts, add `boq/bod` token/prompt and `eoq/eod` token if they exist. + + Args: + `texts` List[str]: texts to tokenize, + Example: + ["how long it take to get a master's degree"] + `is_query` bool: whether the input text(s) is query. + `return_tensors` str: the `return_tensors` argument to tokenizer. 
+ Returns: + Dict[str, Any]: the preprocessed data + """ + if is_query: + bos, eos_id = self.boq, self.eoq_id + else: + bos, eos_id = self.bod, self.eod_id + if bos is not None: + # bos can be prompt + texts = [bos + t for t in texts] + encoding = self.nlp_tokenizer( + texts, return_tensors=return_tensors, **kwargs) + if eos_id is not None: + if return_tensors == 'pt': + self.add_eos_pt(encoding, eos_id) + else: + self.add_eos(encoding, eos_id) + return encoding + + def add_eos_pt(self, encoding: Dict[str, torch.Tensor], eos: int): + """Add `eos` token id to the end of each sequence.""" + input_ids, attn_mask = encoding['input_ids'], encoding[ + 'attention_mask'] + batch = torch.arange(input_ids.size(0)) + length = attn_mask.sum(-1) + + if input_ids.size(1) < self.max_length: + ones = input_ids.new_ones(input_ids.size(0), 1) + attn_mask = torch.cat((ones, attn_mask), dim=1) + padding = ones * self.pad_id + input_ids = torch.cat((input_ids, padding), dim=1) + eos_index = length + else: + eos_index = torch.clamp(length, max=self.max_length - 1) + attn_mask[batch, eos_index] = 1 + input_ids[batch, eos_index] = eos + encoding['input_ids'], encoding[ + 'attention_mask'] = input_ids, attn_mask + return + + def add_eos(self, encoding: Dict[str, list], eos: int): + """Add `eos` token id to the end of each sequence.""" + for ids, mask in zip(encoding['input_ids'], + encoding['attention_mask']): + if len(mask) < self.max_length: + ids.append(eos) + mask.append(1) + else: + last = min(sum(mask), self.max_length - 1) + ids[last] = eos + mask[last] = 1 + return diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py index 13260132..a6dd89ec 100644 --- a/tests/pipelines/test_sentence_embedding.py +++ b/tests/pipelines/test_sentence_embedding.py @@ -21,6 +21,7 @@ class SentenceEmbeddingTest(unittest.TestCase): medical_tiny_model_id = 'damo/nlp_corom_sentence-embedding_chinese-tiny-medical' general_base_model_id = 'damo/nlp_corom_sentence-embedding_chinese-base' general_tiny_model_id = 'damo/nlp_corom_sentence-embedding_chinese-tiny' + bloom_model_id = 'damo/udever-bloom-7b1' inputs = { 'source_sentence': ["how long it take to get a master's degree"], @@ -154,6 +155,14 @@ class SentenceEmbeddingTest(unittest.TestCase): print() print(f'pipeline2: {pipeline2(input=self.medical_inputs1)}') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_bloom_model_from_modelhub(self): + model = Model.from_pretrained(self.bloom_model_id) + tokenizer = SentenceEmbeddingTransformersPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.sentence_embedding, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=self.inputs)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) From 77aa33ad5d22f3af6e2d2d23dce13d58c15e4130 Mon Sep 17 00:00:00 2001 From: "suluyan.sly" Date: Mon, 23 Oct 2023 16:44:43 +0800 Subject: [PATCH 04/18] [swingdeploy]sentence_embedding Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14395686 * swingdeploy_sentence_embedding --- modelscope/pipelines/builder.py | 1 + modelscope/utils/pipeline_inputs.json | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 525bc92c..2de436bb 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -121,6 +121,7 @@ def 
pipeline(task: str = None, ignore_file_pattern=ignore_file_pattern) if pipeline_name is None and kwargs.get('llm_first'): pipeline_name = llm_first_checker(model, model_revision) + kwargs.pop('llm_first') pipeline_props = {'type': pipeline_name} if pipeline_name is None: # get default pipeline for this task diff --git a/modelscope/utils/pipeline_inputs.json b/modelscope/utils/pipeline_inputs.json index 03a00636..c75c8b9c 100644 --- a/modelscope/utils/pipeline_inputs.json +++ b/modelscope/utils/pipeline_inputs.json @@ -145,6 +145,19 @@ "image":"http://modelscope.oss-cn-beijing.aliyuncs.com/demo/images/image_salient_detection.jpg" } }, + "sentence-embedding":{ + "input": { + "source_sentence":[ + "吃完海鲜可以喝牛奶吗?" + ], + "sentences_to_compare":[ + "不可以,早晨喝牛奶不科学", + "吃了海鲜后是不能再喝牛奶的,因为牛奶中含得有维生素C,如果海鲜喝牛奶一起服用会对人体造成一定的伤害", + "吃海鲜是不能同时喝牛奶吃水果,这个至少间隔6小时以上才可以。", + "吃海鲜是不可以吃柠檬的因为其中的维生素C会和海鲜中的矿物质形成砷" + ] + } + }, "shop-segmentation":{ "input":{ "image":"http://modelscope.oss-cn-beijing.aliyuncs.com/demo/images/shop_segmentation.jpg" From cc38523e5232339ed5097ef41e1f623cf9b48e38 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 23 Oct 2023 19:32:22 +0800 Subject: [PATCH 05/18] [to #52738068] fix: fix model version in production Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14394239 * fix model version in production * add more info in log --- modelscope/hub/api.py | 52 ++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index c6a9162a..f83defd0 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -475,35 +475,37 @@ class HubApi: raise NotExistError('The model: %s has no revision : %s .' % (model_id, revision)) logger.info('Development mode use revision: %s' % revision) else: - if revision is None: # user not specified revision, use latest revision before release time - revisions = self.list_model_revisions( - model_id, - cutoff_timestamp=release_timestamp, - use_cookies=False if cookies is None else cookies) - if len(revisions) == 0: - logger.warning(('There is no version specified and there is no version in the model repository,' - 'use the master branch, which is fragile, please use it with caution!')) + all_revisions = self.list_model_revisions( + model_id, + cutoff_timestamp=current_timestamp, + use_cookies=False if cookies is None else cookies) + if len(all_revisions) == 0: + if revision is None or revision == MASTER_MODEL_BRANCH: revision = MASTER_MODEL_BRANCH else: - # tags (revisions) returned from backend are guaranteed to be ordered by create-time - # we shall obtain the latest revision created earlier than release version of this branch - revision = revisions[0] - logger.info( - 'Model revision not specified, use revision: %s' - % revision) + raise NotExistError('The model: %s has no revision: %s !' 
% (model_id, revision)) else: - # use user-specified revision - revisions = self.list_model_revisions( - model_id, - cutoff_timestamp=current_timestamp, - use_cookies=False if cookies is None else cookies) - if revision not in revisions: - if revision == MASTER_MODEL_BRANCH: - logger.warning('Using the master branch is fragile, please use it with caution!') + if revision is None: # user not specified revision, use latest revision before release time + revisions = self.list_model_revisions( + model_id, + cutoff_timestamp=release_timestamp, + use_cookies=False if cookies is None else cookies) + if len(revisions) > 0: + revision = revisions[0] # use latest revision before release time. else: - raise NotExistError('The model: %s has no revision: %s !' % - (model_id, revision)) - logger.info('Use user-specified model revision: %s' % revision) + vl = '[%s]' % ','.join(all_revisions) + raise NoValidRevisionError('Model revision should be specified from revisions: %s' % (vl)) + logger.warning('Model revision not specified, use revision: %s' % revision) + else: + # use user-specified revision + if revision not in all_revisions: + if revision == MASTER_MODEL_BRANCH: + logger.warning('Using the master branch is fragile, please use it with caution!') + else: + vl = '[%s]' % ','.join(all_revisions) + raise NotExistError('The model: %s has no revision: %s valid are: %s!' % + (model_id, revision, vl)) + logger.info('Use user-specified model revision: %s' % revision) return revision def get_model_branches_and_tags( From d30ef8b2024e4d1910b4923318cfa34f3997d9c8 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 24 Oct 2023 15:18:55 +0800 Subject: [PATCH 06/18] fix huggingface position_ids compatible issue Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14406558 * fix compatible issues * fix transformer compatible issue * skip case for huggingface link issue * fix hf autotokenlizer case * Merge branch 'fix_ci_issue' of http://gitlab.alibaba-inc.com/Ali-MaaS/MaaS-lib into fix_ci_issue --- .../cv/controllable_image_generation/controlnet.py | 8 +++++++- modelscope/models/cv/shop_segmentation/head_fpn.py | 4 ++-- modelscope/models/cv/shop_segmentation/models.py | 4 ++-- modelscope/models/cv/shop_segmentation/neck_fpn.py | 4 ++-- .../models/nlp/dgds/document_grounded_dialog_generate.py | 4 ++++ .../nlp/dgds/document_grounded_dialog_retrieval.py | 4 ++++ .../nlp/task_models/machine_reading_comprehension.py | 9 ++++++--- tests/pipelines/test_controllable_image_generation.py | 3 ++- tests/utils/test_hf_util.py | 4 ++-- 9 files changed, 31 insertions(+), 13 deletions(-) diff --git a/modelscope/models/cv/controllable_image_generation/controlnet.py b/modelscope/models/cv/controllable_image_generation/controlnet.py index 9df152b3..5418a39e 100644 --- a/modelscope/models/cv/controllable_image_generation/controlnet.py +++ b/modelscope/models/cv/controllable_image_generation/controlnet.py @@ -22,6 +22,8 @@ from modelscope.metainfo import Models from modelscope.models.base import Tensor from modelscope.models.base.base_torch_model import TorchModel from modelscope.models.builder import MODELS +from modelscope.utils.compatible_with_transformers import \ + compatible_position_ids from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger @@ -88,7 +90,11 @@ class ControlNet(TorchModel): if device == 'gpu': device = 'cuda' model = create_model(yaml_path).cpu() - model.load_state_dict(load_state_dict(ckpt_path, 
location=device)) + state_dict = load_state_dict(ckpt_path, location=device) + compatible_position_ids( + state_dict, + 'cond_stage_model.transformer.text_model.embeddings.position_ids') + model.load_state_dict(state_dict) self.model = model.to(device) self.ddim_sampler = DDIMSampler(self.model) diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py index 0d4027cb..dfa284d4 100644 --- a/modelscope/models/cv/shop_segmentation/head_fpn.py +++ b/modelscope/models/cv/shop_segmentation/head_fpn.py @@ -9,8 +9,8 @@ import numpy as np import torch import torch.nn as nn from mmcv.cnn import ConvModule -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.layers.drop import drop_path +from timm.layers.weight_init import trunc_normal_ from .common import Upsample, resize diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py index a206e9f1..1b07a08c 100644 --- a/modelscope/models/cv/shop_segmentation/models.py +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -11,8 +11,8 @@ from collections import OrderedDict import torch import torch.nn.functional as F import torch.utils.checkpoint as checkpoint -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.layers.drop import drop_path +from timm.layers.weight_init import trunc_normal_ from torch import nn diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py index d344de71..12c11d76 100644 --- a/modelscope/models/cv/shop_segmentation/neck_fpn.py +++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py @@ -8,8 +8,8 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.layers.drop import drop_path +from timm.layers.weight_init import trunc_normal_ from .common import resize diff --git a/modelscope/models/nlp/dgds/document_grounded_dialog_generate.py b/modelscope/models/nlp/dgds/document_grounded_dialog_generate.py index 7c2f6327..27902b67 100644 --- a/modelscope/models/nlp/dgds/document_grounded_dialog_generate.py +++ b/modelscope/models/nlp/dgds/document_grounded_dialog_generate.py @@ -6,6 +6,8 @@ import torch from modelscope.metainfo import Models from modelscope.models.base import Tensor, TorchModel from modelscope.models.builder import MODELS +from modelscope.utils.compatible_with_transformers import \ + compatible_position_ids from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks from .backbone import Re2GModel @@ -24,6 +26,8 @@ class DocumentGroundedDialogGenerateModel(TorchModel): state_dict = torch.load( os.path.join(self.model_dir, ModelFile.TORCH_MODEL_BIN_FILE), map_location='cpu') + compatible_position_ids( + state_dict, 'rerank.encoder.roberta.embeddings.position_ids') self.model.load_state_dict(state_dict) def forward(self, input: Dict[str, Tensor]): diff --git a/modelscope/models/nlp/dgds/document_grounded_dialog_retrieval.py b/modelscope/models/nlp/dgds/document_grounded_dialog_retrieval.py index bd8e05d6..07685673 100644 --- a/modelscope/models/nlp/dgds/document_grounded_dialog_retrieval.py +++ b/modelscope/models/nlp/dgds/document_grounded_dialog_retrieval.py @@ -6,6 +6,8 @@ import torch from modelscope.metainfo import Models from 
modelscope.models.base import Tensor, TorchModel from modelscope.models.builder import MODELS +from modelscope.utils.compatible_with_transformers import \ + compatible_position_ids from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks from .backbone import DPRModel @@ -24,6 +26,8 @@ class DocumentGroundedDialogRetrievalModel(TorchModel): state_dict = torch.load( os.path.join(self.model_dir, ModelFile.TORCH_MODEL_BIN_FILE), map_location='cpu') + compatible_position_ids(state_dict, + 'ctx_encoder.encoder.embeddings.position_ids') self.model.load_state_dict(state_dict) def forward(self, input: Dict[str, Tensor], gck_segment=32): diff --git a/modelscope/models/nlp/task_models/machine_reading_comprehension.py b/modelscope/models/nlp/task_models/machine_reading_comprehension.py index 034e53ce..add62ce8 100644 --- a/modelscope/models/nlp/task_models/machine_reading_comprehension.py +++ b/modelscope/models/nlp/task_models/machine_reading_comprehension.py @@ -16,6 +16,8 @@ from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS from modelscope.models.nlp.task_models.task_model import EncoderModel from modelscope.outputs import MachineReadingComprehensionOutput, OutputKeys +from modelscope.utils.compatible_with_transformers import \ + compatible_position_ids from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.hub import parse_label_mapping @@ -45,9 +47,10 @@ class ModelForMachineReadingComprehension(TorchModel): self.config.hidden_dropout_prob, intermediate_hidden_size=self.config. projection_intermediate_hidden_size) - self.load_state_dict( - torch.load( - os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE))) + state_dict = torch.load( + os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)) + compatible_position_ids(state_dict, 'roberta.embeddings.position_ids') + self.load_state_dict(state_dict) def forward( self, diff --git a/tests/pipelines/test_controllable_image_generation.py b/tests/pipelines/test_controllable_image_generation.py index c1a29f5b..fa2bb4a1 100644 --- a/tests/pipelines/test_controllable_image_generation.py +++ b/tests/pipelines/test_controllable_image_generation.py @@ -25,7 +25,8 @@ class ControllableImageGenerationTest(unittest.TestCase): 'prompt': 'flower' } - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, + 'skip test for huggingface model download issue.') def test_run_with_model_from_modelhub(self): output_image_path = tempfile.NamedTemporaryFile(suffix='.png').name control_types = [ diff --git a/tests/utils/test_hf_util.py b/tests/utils/test_hf_util.py index fcbaf50c..e16bc6fa 100644 --- a/tests/utils/test_hf_util.py +++ b/tests/utils/test_hf_util.py @@ -18,10 +18,10 @@ class HFUtilTest(unittest.TestCase): def test_auto_tokenizer(self): tokenizer = AutoTokenizer.from_pretrained( - 'baichuan-inc/Baichuan-13B-Chat', + 'baichuan-inc/Baichuan2-7B-Chat', trust_remote_code=True, revision='v1.0.3') - self.assertEqual(tokenizer.vocab_size, 64000) + self.assertEqual(tokenizer.vocab_size, 125696) self.assertEqual(tokenizer.model_max_length, 4096) self.assertFalse(tokenizer.is_fast) From c039b7348df72be6f19c43fa22551771d9cb7a69 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 24 Oct 2023 15:19:01 +0800 Subject: [PATCH 07/18] force upgrade funasr build image Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14411115 * force upgrade funasr build image --- .dev_scripts/build_image.sh | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev_scripts/build_image.sh b/.dev_scripts/build_image.sh index 9ce2a4a8..dceaaa22 100644 --- a/.dev_scripts/build_image.sh +++ b/.dev_scripts/build_image.sh @@ -150,7 +150,7 @@ echo -e "Building image with:\npython$python_version\npytorch$torch_version\nten docker_file_content=`cat docker/Dockerfile.ubuntu` if [ "$is_ci_test" != "True" ]; then echo "Building ModelScope lib, will install ModelScope lib to image" - docker_file_content="${docker_file_content} \nRUN pip install --no-cache-dir -U transformers && pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/releases/build/modelscope-$modelscope_version-py3-none-any.whl " + docker_file_content="${docker_file_content} \nRUN pip install --no-cache-dir -U funasr transformers && pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/releases/build/modelscope-$modelscope_version-py3-none-any.whl " fi echo "$is_dsw" if [ "$is_dsw" == "False" ]; then From 0911283dde26f5d5bce6617eedd0a5cdf4c45d66 Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Fri, 20 Oct 2023 16:10:54 +0800 Subject: [PATCH 08/18] add freeU model Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14307648 * support sd21, sdxl --- modelscope/metainfo.py | 1 + .../models/multi_modal/freeu/__init__.py | 22 ++ .../multi_modal/freeu/free_lunch_utils.py | 331 ++++++++++++++++++ modelscope/pipelines/multi_modal/__init__.py | 4 +- .../text_to_image_freeu_pipeline.py | 138 ++++++++ tests/pipelines/test_text_to_image_freeu.py | 57 +++ 6 files changed, 552 insertions(+), 1 deletion(-) create mode 100644 modelscope/models/multi_modal/freeu/__init__.py create mode 100644 modelscope/models/multi_modal/freeu/free_lunch_utils.py create mode 100644 modelscope/pipelines/multi_modal/text_to_image_freeu_pipeline.py create mode 100644 tests/pipelines/test_text_to_image_freeu.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index ea56efb5..377ade9b 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -291,6 +291,7 @@ class Pipelines(object): image_denoise = 'nafnet-image-denoise' image_deblur = 'nafnet-image-deblur' image_editing = 'masactrl-image-editing' + freeu_stable_diffusion_text2image = 'freeu-stable-diffusion-text2image' person_image_cartoon = 'unet-person-image-cartoon' ocr_detection = 'resnet18-ocr-detection' table_recognition = 'dla34-table-recognition' diff --git a/modelscope/models/multi_modal/freeu/__init__.py b/modelscope/models/multi_modal/freeu/__init__.py new file mode 100644 index 00000000..3cd55cf3 --- /dev/null +++ b/modelscope/models/multi_modal/freeu/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
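+# This package only exposes register_free_upblock2d / register_free_crossattn_upblock2d;
+# LazyImportModule defers loading free_lunch_utils until one of them is first accessed.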
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .free_lunch_utils import register_free_upblock2d, register_free_crossattn_upblock2d +else: + _import_structure = { + 'free_lunch_utils': + ['register_free_upblock2d', 'register_free_crossattn_upblock2d'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/multi_modal/freeu/free_lunch_utils.py b/modelscope/models/multi_modal/freeu/free_lunch_utils.py new file mode 100644 index 00000000..eb5d191f --- /dev/null +++ b/modelscope/models/multi_modal/freeu/free_lunch_utils.py @@ -0,0 +1,331 @@ +# ------------------------------------------------------------------------ +# Modified from https://github.com/ChenyangSi/FreeU/blob/main/demo/free_lunch_utils.py +# Copyright (c) 2023 TencentARC. All Rights Reserved. +# ------------------------------------------------------------------------ + +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.fft as fft +from diffusers.utils import is_torch_version + + +def isinstance_str(x: object, cls_name: str): + """ + Checks whether x has any class *named* cls_name in its ancestry. + Doesn't require access to the class's implementation. + + Useful for patching! + """ + + for _cls in x.__class__.__mro__: + if _cls.__name__ == cls_name: + return True + + return False + + +def Fourier_filter(x, threshold, scale): + dtype = x.dtype + x = x.type(torch.float32) + # FFT + x_freq = fft.fftn(x, dim=(-2, -1)) + x_freq = fft.fftshift(x_freq, dim=(-2, -1)) + + B, C, H, W = x_freq.shape + mask = torch.ones((B, C, H, W)).cuda() + + crow, ccol = H // 2, W // 2 + mask[..., crow - threshold:crow + threshold, + ccol - threshold:ccol + threshold] = scale + x_freq = x_freq * mask + + # IFFT + x_freq = fft.ifftshift(x_freq, dim=(-2, -1)) + x_filtered = fft.ifftn(x_freq, dim=(-2, -1)).real + + x_filtered = x_filtered.type(dtype) + return x_filtered + + +def register_upblock2d(model): + + def up_forward(self): + + def forward(hidden_states, + res_hidden_states_tuple, + temb=None, + upsample_size=None): + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], + dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version('>=', '1.11.0'): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb) + else: + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + return forward + + for i, upsample_block in enumerate(model.unet.up_blocks): + if isinstance_str(upsample_block, 'UpBlock2D'): + upsample_block.forward = up_forward(upsample_block) + + +def register_free_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2): + + def up_forward(self): + + def forward(hidden_states, + res_hidden_states_tuple, + temb=None, + upsample_size=None): + for resnet 
in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # --------------- FreeU code ----------------------- + # Only operate on the first two stages + if hidden_states.shape[1] == 1280: + hidden_states[:, :640] = hidden_states[:, :640] * self.b1 + res_hidden_states = Fourier_filter( + res_hidden_states, threshold=1, scale=self.s1) + if hidden_states.shape[1] == 640: + hidden_states[:, :320] = hidden_states[:, :320] * self.b2 + res_hidden_states = Fourier_filter( + res_hidden_states, threshold=1, scale=self.s2) + # --------------------------------------------------------- + + hidden_states = torch.cat([hidden_states, res_hidden_states], + dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version('>=', '1.11.0'): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb) + else: + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + return forward + + for i, upsample_block in enumerate(model.unet.up_blocks): + if isinstance_str(upsample_block, 'UpBlock2D'): + upsample_block.forward = up_forward(upsample_block) + setattr(upsample_block, 'b1', b1) + setattr(upsample_block, 'b2', b2) + setattr(upsample_block, 's1', s1) + setattr(upsample_block, 's2', s2) + + +def register_crossattn_upblock2d(model): + + def up_forward(self): + + def forward( + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ): + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], + dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = { + 'use_reentrant': False + } if is_torch_version('>=', '1.11.0') else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + **ckpt_kwargs, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + 
attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + return forward + + for i, upsample_block in enumerate(model.unet.up_blocks): + if isinstance_str(upsample_block, 'CrossAttnUpBlock2D'): + upsample_block.forward = up_forward(upsample_block) + + +def register_free_crossattn_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2): + + def up_forward(self): + + def forward( + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ): + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # --------------- FreeU code ----------------------- + # Only operate on the first two stages + if hidden_states.shape[1] == 1280: + hidden_states[:, :640] = hidden_states[:, :640] * self.b1 + res_hidden_states = Fourier_filter( + res_hidden_states, threshold=1, scale=self.s1) + if hidden_states.shape[1] == 640: + hidden_states[:, :320] = hidden_states[:, :320] * self.b2 + res_hidden_states = Fourier_filter( + res_hidden_states, threshold=1, scale=self.s2) + # --------------------------------------------------------- + + hidden_states = torch.cat([hidden_states, res_hidden_states], + dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = { + 'use_reentrant': False + } if is_torch_version('>=', '1.11.0') else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + **ckpt_kwargs, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + return forward + + for i, upsample_block in enumerate(model.unet.up_blocks): + if isinstance_str(upsample_block, 'CrossAttnUpBlock2D'): + upsample_block.forward = up_forward(upsample_block) + setattr(upsample_block, 'b1', b1) + setattr(upsample_block, 'b2', b2) + setattr(upsample_block, 's1', s1) + setattr(upsample_block, 's2', s2) diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py index b5316684..1faa261e 100644 --- a/modelscope/pipelines/multi_modal/__init__.py +++ b/modelscope/pipelines/multi_modal/__init__.py @@ -26,6 +26,7 @@ if 
TYPE_CHECKING: from .visual_question_answering_pipeline import VisualQuestionAnsweringPipeline from .video_question_answering_pipeline import VideoQuestionAnsweringPipeline from .videocomposer_pipeline import VideoComposerPipeline + from .text_to_image_freeu_pipeline import FreeUTextToImagePipeline else: _import_structure = { 'image_captioning_pipeline': ['ImageCaptioningPipeline'], @@ -53,7 +54,8 @@ else: ['SOONetVideoTemporalGroundingPipeline'], 'text_to_video_synthesis_pipeline': ['TextToVideoSynthesisPipeline'], 'multimodal_dialogue_pipeline': ['MultimodalDialoguePipeline'], - 'videocomposer_pipeline': ['VideoComposerPipeline'] + 'videocomposer_pipeline': ['VideoComposerPipeline'], + 'text_to_image_freeu_pipeline': ['FreeUTextToImagePipeline'] } import sys diff --git a/modelscope/pipelines/multi_modal/text_to_image_freeu_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_freeu_pipeline.py new file mode 100644 index 00000000..9300554c --- /dev/null +++ b/modelscope/pipelines/multi_modal/text_to_image_freeu_pipeline.py @@ -0,0 +1,138 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path +from typing import Any, Dict, Optional, Union + +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal.freeu import ( + register_free_crossattn_upblock2d, register_free_upblock2d) +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['FreeUTextToImagePipeline'] + + +@PIPELINES.register_module( + Tasks.text_to_image_synthesis, + module_name=Pipelines.freeu_stable_diffusion_text2image) +class FreeUTextToImagePipeline(Pipeline): + + def __init__(self, model=str, preprocessor=None, **kwargs): + """ FreeU Text to Image Pipeline. 
+ + Examples: + + >>> import cv2 + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + + >>> prompt = "a photo of a running corgi" # prompt + >>> output_image_path = './result.png' + >>> inputs = {'prompt': prompt} + >>> + >>> pipe = pipeline( + >>> Tasks.text_to_image_synthesis, + >>> model='damo/multi-modal_freeu_stable_diffusion', + >>> base_model='AI-ModelScope/stable-diffusion-v1-5', + >>> ) + >>> + >>> output = pipe(inputs)['output_imgs'] + >>> cv2.imwrite(output_image_path, output) + >>> print('pipeline: the output image path is {}'.format(output_image_path)) + """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + torch_dtype = kwargs.get('torch_dtype', torch.float32) + self._device = getattr( + kwargs, 'device', + torch.device('cuda' if torch.cuda.is_available() else 'cpu')) + base_model = kwargs.get( + 'base_model', 'AI-ModelScope/stable-diffusion-v1-5') # default 1.5 + self.freeu_params = kwargs.get('freeu_params', { + 'b1': 1.5, + 'b2': 1.6, + 's1': 0.9, + 's2': 0.2 + }) # default + + logger.info('load freeu stable diffusion text to image pipeline done') + self.pipeline = pipeline( + task=Tasks.text_to_image_synthesis, + model=base_model, + torch_dtype=torch_dtype, + device=self._device).pipeline + + def preprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return inputs + + def forward(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + """ + Inputs Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + """ + if not isinstance(inputs, dict): + raise ValueError( + f'Expected the input to be a dictionary, but got {type(inputs)}' + ) + # -------- freeu block registration + register_free_upblock2d(self.pipeline, **self.freeu_params) + register_free_crossattn_upblock2d(self.pipeline, **self.freeu_params) + # -------- freeu block registration + + output = self.pipeline( + prompt=inputs.get('prompt'), + height=inputs.get('height'), + width=inputs.get('width'), + num_inference_steps=inputs.get('num_inference_steps', 50), + guidance_scale=inputs.get('guidance_scale', 7.5), + negative_prompt=inputs.get('negative_prompt'), + num_images_per_prompt=inputs.get('num_images_per_prompt', 1), + eta=inputs.get('eta', 0.0), + generator=inputs.get('generator'), + latents=inputs.get('latents'), + ).images[0] + + return {'output_tensor': output} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + output_img = np.array(inputs['output_tensor']) + return {OutputKeys.OUTPUT_IMGS: output_img[:, :, ::-1]} diff --git a/tests/pipelines/test_text_to_image_freeu.py b/tests/pipelines/test_text_to_image_freeu.py new file mode 100644 index 00000000..7aebe318 --- /dev/null +++ b/tests/pipelines/test_text_to_image_freeu.py @@ -0,0 +1,57 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +import cv2 + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.multi_modal import FreeUTextToImagePipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class ImageEditingTest(unittest.TestCase): + + def setUp(self) -> None: + self.task = Tasks.text_to_image_synthesis + self.model_id = 'damo/multi-modal_freeu_stable_diffusion' + prompt = 'a photo of a running corgi' # prompt + self.inputs = {'prompt': prompt} + self.output_image_path = './result.png' + self.base_model = 'AI-ModelScope/stable-diffusion-v2-1' + self.freeu_params = { + 'b1': 1.4, + 'b2': 1.6, + 's1': 0.9, + 's2': 0.2 + } # for SD2.1 + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + pipeline = FreeUTextToImagePipeline(cache_path) + pipeline.group_key = self.task + synthesized_img = pipeline( + input=self.inputs)[OutputKeys.OUTPUT_IMGS] # BGR + cv2.imwrite(self.output_image_path, synthesized_img) + print('FreeU pipeline: the synthesized image path is {}'.format( + self.output_image_path)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.text_to_image_synthesis, + model=self.model_id, + base_model=self.base_model, + freeu_params=self.freeu_params) + synthesized_img = pipeline_ins( + self.inputs)[OutputKeys.OUTPUT_IMGS] # BGR + cv2.imwrite(self.output_image_path, synthesized_img) + print('FreeU pipeline: the synthesized image path is {}'.format( + self.output_image_path)) + + +if __name__ == '__main__': + unittest.main() From ebd6ddb530a5ae0d37045c9f1d686bbff7fbdf73 Mon Sep 17 00:00:00 2001 From: 
"zhangyanzhao.zyz" Date: Fri, 20 Oct 2023 19:56:01 +0800 Subject: [PATCH 09/18] =?UTF-8?q?=E6=9B=B4=E6=96=B0sentence=20embedding=20?= =?UTF-8?q?model=EF=BC=8C=E6=94=AF=E6=8C=81gte=EF=BC=8Cbloom=20sentence=20?= =?UTF-8?q?embedding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14375781 * fix linter * bloom embedding --- .../models/nlp/bert/sentence_embedding.py | 15 +- modelscope/models/nlp/bloom/__init__.py | 2 + .../models/nlp/bloom/sentence_embedding.py | 165 ++++++++++++++++++ .../nlp/sentence_embedding_preprocessor.py | 103 ++++++++++- tests/pipelines/test_sentence_embedding.py | 9 + 5 files changed, 286 insertions(+), 8 deletions(-) create mode 100644 modelscope/models/nlp/bloom/sentence_embedding.py diff --git a/modelscope/models/nlp/bert/sentence_embedding.py b/modelscope/models/nlp/bert/sentence_embedding.py index 92a9da50..b7df5ef9 100644 --- a/modelscope/models/nlp/bert/sentence_embedding.py +++ b/modelscope/models/nlp/bert/sentence_embedding.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import torch +import torch.nn.functional as F from torch import nn from modelscope.metainfo import Models @@ -61,8 +62,9 @@ class BertForSentenceEmbedding(BertPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config) self.config = config - self.pooler_type = kwargs.get('pooler_type', 'cls') + self.pooler_type = kwargs.get('emb_pooler_type', 'cls') self.pooler = Pooler(self.pooler_type) + self.normalize = kwargs.get('normalize', False) setattr(self, self.base_model_prefix, BertModel(config, add_pooling_layer=False)) @@ -128,6 +130,8 @@ class BertForSentenceEmbedding(BertPreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict) outputs = self.pooler(outputs, attention_mask) + if self.normalize: + outputs = F.normalize(outputs, p=2, dim=-1) return outputs @classmethod @@ -142,8 +146,11 @@ class BertForSentenceEmbedding(BertPreTrainedModel): The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ model_dir = kwargs.get('model_dir') - model = super( - Model, - cls).from_pretrained(pretrained_model_name_or_path=model_dir) + model_kwargs = { + 'emb_pooler_type': kwargs.get('emb_pooler_type', 'cls'), + 'normalize': kwargs.get('normalize', False) + } + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) model.model_dir = model_dir return model diff --git a/modelscope/models/nlp/bloom/__init__.py b/modelscope/models/nlp/bloom/__init__.py index b0f04af7..24d7202d 100644 --- a/modelscope/models/nlp/bloom/__init__.py +++ b/modelscope/models/nlp/bloom/__init__.py @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .backbone import BloomModel from .text_generation import BloomForTextGeneration + from .sentence_embedding import BloomForSentenceEmbedding else: _import_structure = { 'backbone': ['BloomModel'], 'text_generation': ['BloomForTextGeneration'], + 'sentence_embedding': ['BloomForSentenceEmbedding'] } import sys sys.modules[__name__] = LazyImportModule( diff --git a/modelscope/models/nlp/bloom/sentence_embedding.py b/modelscope/models/nlp/bloom/sentence_embedding.py new file mode 100644 index 00000000..ec35db38 --- /dev/null +++ b/modelscope/models/nlp/bloom/sentence_embedding.py @@ -0,0 +1,165 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import torch +from transformers import BloomConfig +from transformers import BloomModel as BloomModelTransform + +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.outputs import SentencEmbeddingModelOutput +from modelscope.utils.constant import Tasks + + +class DecoderPooler(torch.nn.Module): + """ + Parameter-free poolers to get the sentence embedding + 'last': the last token state. + 'weighted_mean': position weighted average of all token states. + """ + + def __init__(self, pooler_type): + super().__init__() + self.pooler_type = pooler_type + assert self.pooler_type in [ + 'last', 'weighted_mean' + ], 'unrecognized pooling type %s' % self.pooler_type + + def forward(self, outputs, attention_mask): + last_hidden = outputs.last_hidden_state + + if self.pooler_type in ['last']: + n, l, h = last_hidden.shape + + # Get shape [n] indices of the last token (i.e. the last token for each batch item) + # Any sequence where min == 1, we use the entire sequence lenth since argmin = 0 + values, indices = torch.min(attention_mask, 1, keepdim=False) + gather_indices = torch.where(values == 0, indices, + l) - 1 # Shape [n] + + # There are empty sequences, where the index would become -1 which will crash + gather_indices = torch.clamp(gather_indices, min=0) + + # Turn indices from shape [n] --> [n, 1, h] + gather_indices = gather_indices.unsqueeze(1).unsqueeze(1).expand( + n, 1, h) + + # Gather along the 1st dim (l) (n, l, h -> n, h) + pooled_output = torch.gather(last_hidden, 1, + gather_indices).squeeze(dim=1) + + elif self.pooler_type == 'weighted_mean': + input_mask_expanded = attention_mask.unsqueeze(-1).expand( + last_hidden.size()).float() + # last_hidden shape: bs, seq, hidden_dim + weights = ( + torch.arange(start=1, end=last_hidden.shape[1] + + 1).unsqueeze(0).unsqueeze(-1).expand( + last_hidden.size()).float().to( + last_hidden.device)) + assert weights.shape == last_hidden.shape == input_mask_expanded.shape + input_mask_expanded = input_mask_expanded * weights + + sum_embeddings = torch.sum(last_hidden * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + sum_mask = torch.clamp(sum_mask, min=1e-9) + pooled_output = sum_embeddings / sum_mask + + else: + raise NotImplementedError + + return pooled_output + + +@MODELS.register_module( + group_key=Tasks.sentence_embedding, module_name=Models.bloom) +class BloomForSentenceEmbedding(BloomModelTransform, TorchModel): + r""" + This model represent a text to a dense vector by the last token state or weighted mean of all token states. + See `Language Models are Universal Embedders + `_ for details. + """ + + def __init__(self, config, **kwargs): + super().__init__(config) + self.config = config + self.pooler_type = kwargs.get('emb_pooler_type', 'weighted_mean') + self.pooler = DecoderPooler(self.pooler_type) + self.normalize = kwargs.get('normalize', False) + setattr(self, self.base_model_prefix, BloomModelTransform(config)) + + def forward(self, query=None, docs=None, labels=None): + r""" + Args: + query (:obj: `dict`): Dict of pretrained models's input for the query sequence. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + docs (:obj: `dict`): Dict of pretrained models's input for the query sequence. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. 
+ Returns: + Returns `modelscope.outputs.SentencEmbeddingModelOutput + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_udever_bloom_560m') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_udever_bloom_560m') + >>> inputs = preprocessor({'source_sentence': ['This is a test']}) + >>> outputs = model(**inputs) + >>> print(outputs) + """ + query_embeddings, doc_embeddings = None, None + if query is not None: + query_embeddings = self.encode(**query) + if docs is not None: + doc_embeddings = self.encode(**docs) + outputs = SentencEmbeddingModelOutput( + query_embeddings=query_embeddings, doc_embeddings=doc_embeddings) + if query_embeddings is None or doc_embeddings is None: + return outputs + if self.base_model.training: + loss_fct = torch.nn.CrossEntropyLoss() + scores = torch.matmul(query_embeddings, doc_embeddings.T) + if labels is None: + labels = torch.arange( + scores.size(0), device=scores.device, dtype=torch.long) + labels = labels * ( + doc_embeddings.size(0) // query_embeddings.size(0)) + loss = loss_fct(scores, labels) + outputs.loss = loss + return outputs + + def encode( + self, + input_ids=None, + attention_mask=None, + ): + outputs = self.base_model.forward( + input_ids, attention_mask=attention_mask) + embeddings = self.pooler(outputs, attention_mask) + if self.normalize: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1) + return embeddings + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_dir = kwargs.get('model_dir') + model_kwargs = { + 'emb_pooler_type': kwargs.get('emb_pooler_type', 'weighted_mean'), + 'normalize': kwargs.get('normalize', False) + } + if model_dir is None: + config = BloomConfig(**kwargs) + model = cls(config) + else: + model = super(BloomModelTransform, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + model.model_dir = model_dir + return model diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py index b03268c6..f1ca6685 100644 --- a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -1,14 +1,19 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict +from typing import Any, Dict, Optional + +import torch from modelscope.metainfo import Preprocessors from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.hub import get_model_type +from modelscope.utils.logger import get_logger from .transformers_tokenizer import NLPTokenizer +logger = get_logger() + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sentence_embedding) @@ -46,9 +51,32 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): self.max_length = max_length if model_dir is not None: model_type = get_model_type(model_dir) + # we could add `boq/bod` token/prompt and `eoq/eod` token if they exist when tokenizing. 
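As a standalone reference for the pooling introduced above: DecoderPooler's 'weighted_mean' branch weights each token state by its 1-based position (later tokens in a decoder-only model have attended to more context), zeroes out padded positions, and divides the weighted sum by the summed weights. A minimal equivalent sketch with toy shapes:

import torch

def weighted_mean_pool(last_hidden, attention_mask):
    # last_hidden: [batch, seq, dim]; attention_mask: [batch, seq], 1 for real tokens
    weights = torch.arange(1, last_hidden.size(1) + 1,
                           device=last_hidden.device).float()        # [seq]
    w = (attention_mask.float() * weights).unsqueeze(-1)             # [batch, seq, 1]
    return (last_hidden * w).sum(dim=1) / w.sum(dim=1).clamp(min=1e-9)

emb = weighted_mean_pool(torch.randn(2, 5, 8), torch.ones(2, 5))
print(emb.shape)  # torch.Size([2, 8])

Both BertForSentenceEmbedding and BloomForSentenceEmbedding expose the same `emb_pooler_type` and `normalize` options through `_instantiate`; with normalize=True the pooled vectors are L2-normalized, so a plain dot product between them is already a cosine similarity.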
+ for k in ('boq', 'eoq', 'bod', 'eod'): + setattr(self, k, kwargs.pop(k, None)) self.nlp_tokenizer = NLPTokenizer( model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs) super().__init__(mode=mode) + tokenizer = self.nlp_tokenizer.tokenizer + # For tokenizers like bloom + if tokenizer.padding_side != 'right': + # weighted mean pooling need pad right + logger.warning( + f'Change tokenizer.padding_side from {tokenizer.padding_side} to right' + ) + tokenizer.padding_side = 'right' + # For decoder-only tokenizers + if tokenizer.pad_token is None: + logger.warning( + f'Set tokenizer.pad_token as eos_token {tokenizer.eos_token}') + tokenizer.pad_token = tokenizer.eos_token + # Currently eos is single token, we can extend to prompt later. + for k in ('eoq', 'eod'): + v = getattr(self, k, None) + if v is not None: + v = tokenizer.convert_tokens_to_ids(v) + setattr(self, k + '_id', v) + self.pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token) def __call__(self, data: Dict, @@ -81,13 +109,80 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): if 'return_tensors' not in kwargs: kwargs[ 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None - query_inputs = self.nlp_tokenizer( - source_sentences, padding=padding, truncation=truncation, **kwargs) + query_inputs = self.tokenize( + source_sentences, + is_query=True, + padding=padding, + truncation=truncation, + **kwargs) tokenized_inputs = {'query': query_inputs, 'docs': None} if compare_sentences is not None and len(compare_sentences) > 0: - tokenized_inputs['docs'] = self.nlp_tokenizer( + tokenized_inputs['docs'] = self.tokenize( compare_sentences, + is_query=kwargs.get('symmetric', False), padding=padding, truncation=truncation, **kwargs) return tokenized_inputs + + def tokenize(self, texts, is_query=True, return_tensors=None, **kwargs): + """Tokenize raw texts, add `boq/bod` token/prompt and `eoq/eod` token if they exist. + + Args: + `texts` List[str]: texts to tokenize, + Example: + ["how long it take to get a master's degree"] + `is_query` bool: whether the input text(s) is query. + `return_tensors` str: the `return_tensors` argument to tokenizer. 
+ Returns: + Dict[str, Any]: the preprocessed data + """ + if is_query: + bos, eos_id = self.boq, self.eoq_id + else: + bos, eos_id = self.bod, self.eod_id + if bos is not None: + # bos can be prompt + texts = [bos + t for t in texts] + encoding = self.nlp_tokenizer( + texts, return_tensors=return_tensors, **kwargs) + if eos_id is not None: + if return_tensors == 'pt': + self.add_eos_pt(encoding, eos_id) + else: + self.add_eos(encoding, eos_id) + return encoding + + def add_eos_pt(self, encoding: Dict[str, torch.Tensor], eos: int): + """Add `eos` token id to the end of each sequence.""" + input_ids, attn_mask = encoding['input_ids'], encoding[ + 'attention_mask'] + batch = torch.arange(input_ids.size(0)) + length = attn_mask.sum(-1) + + if input_ids.size(1) < self.max_length: + ones = input_ids.new_ones(input_ids.size(0), 1) + attn_mask = torch.cat((ones, attn_mask), dim=1) + padding = ones * self.pad_id + input_ids = torch.cat((input_ids, padding), dim=1) + eos_index = length + else: + eos_index = torch.clamp(length, max=self.max_length - 1) + attn_mask[batch, eos_index] = 1 + input_ids[batch, eos_index] = eos + encoding['input_ids'], encoding[ + 'attention_mask'] = input_ids, attn_mask + return + + def add_eos(self, encoding: Dict[str, list], eos: int): + """Add `eos` token id to the end of each sequence.""" + for ids, mask in zip(encoding['input_ids'], + encoding['attention_mask']): + if len(mask) < self.max_length: + ids.append(eos) + mask.append(1) + else: + last = min(sum(mask), self.max_length - 1) + ids[last] = eos + mask[last] = 1 + return diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py index 13260132..a6dd89ec 100644 --- a/tests/pipelines/test_sentence_embedding.py +++ b/tests/pipelines/test_sentence_embedding.py @@ -21,6 +21,7 @@ class SentenceEmbeddingTest(unittest.TestCase): medical_tiny_model_id = 'damo/nlp_corom_sentence-embedding_chinese-tiny-medical' general_base_model_id = 'damo/nlp_corom_sentence-embedding_chinese-base' general_tiny_model_id = 'damo/nlp_corom_sentence-embedding_chinese-tiny' + bloom_model_id = 'damo/udever-bloom-7b1' inputs = { 'source_sentence': ["how long it take to get a master's degree"], @@ -154,6 +155,14 @@ class SentenceEmbeddingTest(unittest.TestCase): print() print(f'pipeline2: {pipeline2(input=self.medical_inputs1)}') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_bloom_model_from_modelhub(self): + model = Model.from_pretrained(self.bloom_model_id) + tokenizer = SentenceEmbeddingTransformersPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.sentence_embedding, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=self.inputs)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) From 107975620558cbc0957e7fabbd76bdc3f3a033ce Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Tue, 24 Oct 2023 20:13:05 +0800 Subject: [PATCH 10/18] bump version to 1.9.4 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index 0ec59aaa..fb0e01f3 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. 
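To make the eos handling above concrete, here is the list-based `add_eos` path applied to a toy encoding (the ids, mask, eos id and max_length below are made-up illustrative values): when the sequence is shorter than max_length the eos id is appended and the mask extended, otherwise it overwrites the last usable position.

eos_id, max_length = 2, 8  # illustrative values; the real eos id comes from the tokenizer
encoding = {'input_ids': [[5, 6, 7]], 'attention_mask': [[1, 1, 1]]}

for ids, mask in zip(encoding['input_ids'], encoding['attention_mask']):
    if len(mask) < max_length:
        ids.append(eos_id)
        mask.append(1)
    else:
        last = min(sum(mask), max_length - 1)
        ids[last] = eos_id
        mask[last] = 1

print(encoding)
# {'input_ids': [[5, 6, 7, 2]], 'attention_mask': [[1, 1, 1, 1]]}

The tensor path `add_eos_pt` has the same effect on right-padded batches, working in place on the stacked input_ids and attention_mask.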
-__version__ = '1.9.3' +__version__ = '1.9.4' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-09-06 00:00:00' From e8b34a3bcdc2d0d919b2156a81193f5c58bc086d Mon Sep 17 00:00:00 2001 From: "chenyafeng.cyf" Date: Wed, 25 Oct 2023 20:36:36 +0800 Subject: [PATCH 11/18] add_eres2net_speaker_diarization Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14412468 * add_eres2net_speaker_diarization * fix lint issue * update code * update pipeline --- modelscope/metainfo.py | 1 + modelscope/models/audio/sv/ERes2Net.py | 14 +- modelscope/models/audio/sv/ERes2Net_aug.py | 14 +- modelscope/models/audio/sv/TDNN.py | 303 ++++++++++++++++ .../sv/speaker_change_locator_xvector.py | 329 ++++++++++++++++++ .../audio/segmentation_clustering_pipeline.py | 5 +- .../speaker_verification_eres2net_pipeline.py | 121 +++++-- .../speaker_verification_light_pipeline.py | 5 +- tests/pipelines/test_speaker_verification.py | 25 +- 9 files changed, 769 insertions(+), 48 deletions(-) create mode 100644 modelscope/models/audio/sv/TDNN.py create mode 100644 modelscope/models/audio/sv/speaker_change_locator_xvector.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 377ade9b..87d5f312 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -200,6 +200,7 @@ class Models(object): eres2net_sv = 'eres2net-sv' eres2net_aug_sv = 'eres2net-aug-sv' scl_sd = 'scl-sd' + scl_sd_xvector = 'scl-sd-xvector' campplus_lre = 'cam++-lre' eres2net_lre = 'eres2net-lre' cluster_backend = 'cluster-backend' diff --git a/modelscope/models/audio/sv/ERes2Net.py b/modelscope/models/audio/sv/ERes2Net.py index 54f81ac9..0119783c 100644 --- a/modelscope/models/audio/sv/ERes2Net.py +++ b/modelscope/models/audio/sv/ERes2Net.py @@ -8,6 +8,7 @@ import math import os from typing import Any, Dict, Union +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -323,13 +324,18 @@ class SpeakerVerificationERes2Net(TorchModel): self.embedding_model.eval() def forward(self, audio): - assert len(audio.shape) == 2 and audio.shape[ - 0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]' - # audio shape: [1, T] + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] feature = self.__extract_feature(audio) embedding = self.embedding_model(feature) - return embedding + return embedding.detach().cpu() def __extract_feature(self, audio): feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) diff --git a/modelscope/models/audio/sv/ERes2Net_aug.py b/modelscope/models/audio/sv/ERes2Net_aug.py index 08f1a8a1..d0739cad 100644 --- a/modelscope/models/audio/sv/ERes2Net_aug.py +++ b/modelscope/models/audio/sv/ERes2Net_aug.py @@ -8,6 +8,7 @@ import math import os from typing import Any, Dict, Union +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -316,13 +317,18 @@ class SpeakerVerificationERes2Net(TorchModel): self.embedding_model.eval() def forward(self, audio): - assert len(audio.shape) == 2 and audio.shape[ - 0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]' - # audio shape: [1, T] + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = 
audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] feature = self.__extract_feature(audio) embedding = self.embedding_model(feature) - return embedding + return embedding.detach().cpu() def __extract_feature(self, audio): feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) diff --git a/modelscope/models/audio/sv/TDNN.py b/modelscope/models/audio/sv/TDNN.py new file mode 100644 index 00000000..9cc35c1f --- /dev/null +++ b/modelscope/models/audio/sv/TDNN.py @@ -0,0 +1,303 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Conv1d_O(nn.Module): + + def __init__( + self, + out_channels, + kernel_size, + input_shape=None, + in_channels=None, + stride=1, + dilation=1, + padding='same', + groups=1, + bias=True, + padding_mode='reflect', + skip_transpose=False, + ): + super().__init__() + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.padding_mode = padding_mode + self.unsqueeze = False + self.skip_transpose = skip_transpose + + if input_shape is None and in_channels is None: + raise ValueError('Must provide one of input_shape or in_channels') + + if in_channels is None: + in_channels = self._check_input_shape(input_shape) + + self.conv = nn.Conv1d( + in_channels, + out_channels, + self.kernel_size, + stride=self.stride, + dilation=self.dilation, + padding=0, + groups=groups, + bias=bias, + ) + + def forward(self, x): + """Returns the output of the convolution. + + Arguments + --------- + x : torch.Tensor (batch, time, channel) + input to convolve. 2d or 4d tensors are expected. + """ + + if not self.skip_transpose: + x = x.transpose(1, -1) + + if self.unsqueeze: + x = x.unsqueeze(1) + + if self.padding == 'same': + x = self._manage_padding(x, self.kernel_size, self.dilation, + self.stride) + + elif self.padding == 'causal': + num_pad = (self.kernel_size - 1) * self.dilation + x = F.pad(x, (num_pad, 0)) + + elif self.padding == 'valid': + pass + + else: + raise ValueError( + "Padding must be 'same', 'valid' or 'causal'. Got " + + self.padding) + + wx = self.conv(x) + + if self.unsqueeze: + wx = wx.squeeze(1) + + if not self.skip_transpose: + wx = wx.transpose(1, -1) + + return wx + + def _manage_padding( + self, + x, + kernel_size: int, + dilation: int, + stride: int, + ): + # Detecting input shape + L_in = x.shape[-1] + + # Time padding + padding = get_padding_elem(L_in, stride, kernel_size, dilation) + + # Applying padding + x = F.pad(x, padding, mode=self.padding_mode) + + return x + + def _check_input_shape(self, shape): + """Checks the input shape and returns the number of input channels. + """ + + if len(shape) == 2: + self.unsqueeze = True + in_channels = 1 + elif self.skip_transpose: + in_channels = shape[1] + elif len(shape) == 3: + in_channels = shape[2] + else: + raise ValueError('conv1d expects 2d, 3d inputs. Got ' + + str(len(shape))) + + # Kernel size must be odd + if self.kernel_size % 2 == 0: + raise ValueError( + 'The field kernel size must be an odd number. Got %s.' 
% + (self.kernel_size)) + return in_channels + + +# Skip transpose as much as possible for efficiency +class Conv1d(Conv1d_O): + + def __init__(self, *args, **kwargs): + super().__init__(skip_transpose=True, *args, **kwargs) + + +def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int): + """This function computes the number of elements to add for zero-padding. + + Arguments + --------- + L_in : int + stride: int + kernel_size : int + dilation : int + """ + if stride > 1: + n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) + L_out = stride * (n_steps - 1) + kernel_size * dilation + padding = [kernel_size // 2, kernel_size // 2] + + else: + L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 + + padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] + return padding + + +class BatchNorm1d_O(nn.Module): + + def __init__( + self, + input_shape=None, + input_size=None, + eps=1e-05, + momentum=0.1, + affine=True, + track_running_stats=True, + combine_batch_time=False, + skip_transpose=False, + ): + super().__init__() + self.combine_batch_time = combine_batch_time + self.skip_transpose = skip_transpose + + if input_size is None and skip_transpose: + input_size = input_shape[1] + elif input_size is None: + input_size = input_shape[-1] + + self.norm = nn.BatchNorm1d( + input_size, + eps=eps, + momentum=momentum, + affine=affine, + track_running_stats=track_running_stats, + ) + + def forward(self, x): + """Returns the normalized input tensor. + + Arguments + --------- + x : torch.Tensor (batch, time, [channels]) + input to normalize. 2d or 3d tensors are expected in input + 4d tensors can be used when combine_dims=True. + """ + shape_or = x.shape + if self.combine_batch_time: + if x.ndim == 3: + x = x.reshape(shape_or[0] * shape_or[1], shape_or[2]) + else: + x = x.reshape(shape_or[0] * shape_or[1], shape_or[3], + shape_or[2]) + + elif not self.skip_transpose: + x = x.transpose(-1, 1) + + x_n = self.norm(x) + + if self.combine_batch_time: + x_n = x_n.reshape(shape_or) + elif not self.skip_transpose: + x_n = x_n.transpose(1, -1) + + return x_n + + +class BatchNorm1d(BatchNorm1d_O): + + def __init__(self, *args, **kwargs): + super().__init__(skip_transpose=True, *args, **kwargs) + + +class Xvector(torch.nn.Module): + """This model extracts X-vectors for speaker recognition and diarization. + + Arguments + --------- + device : str + Device used e.g. "cpu" or "cuda". + activation : torch class + A class for constructing the activation layers. + tdnn_blocks : int + Number of time-delay neural (TDNN) layers. + tdnn_channels : list of ints + Output channels for TDNN layer. + tdnn_kernel_sizes : list of ints + List of kernel sizes for each TDNN layer. + tdnn_dilations : list of ints + List of dilations for kernels in each TDNN layer. + lin_neurons : int + Number of neurons in linear layers. 
+ + Example + ------- + >>> compute_xvect = Xvector('cpu') + >>> input_feats = torch.rand([5, 10, 40]) + >>> outputs = compute_xvect(input_feats) + >>> outputs.shape + torch.Size([5, 1, 512]) + """ + + def __init__( + self, + device='cpu', + activation=torch.nn.LeakyReLU, + tdnn_blocks=5, + tdnn_channels=[512, 512, 512, 512, 1500], + tdnn_kernel_sizes=[5, 3, 3, 1, 1], + tdnn_dilations=[1, 2, 3, 1, 1], + lin_neurons=512, + in_channels=80, + ): + + super().__init__() + self.blocks = nn.ModuleList() + + # TDNN layers + for block_index in range(tdnn_blocks): + out_channels = tdnn_channels[block_index] + self.blocks.extend([ + Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=tdnn_kernel_sizes[block_index], + dilation=tdnn_dilations[block_index], + ), + activation(), + BatchNorm1d(input_size=out_channels), + ]) + in_channels = tdnn_channels[block_index] + + def forward(self, x, lens=None): + """Returns the x-vectors. + + Arguments + --------- + x : torch.Tensor + """ + + x = x.transpose(1, 2) + + for layer in self.blocks: + try: + x = layer(x, lengths=lens) + except TypeError: + x = layer(x) + x = x.transpose(1, 2) + return x diff --git a/modelscope/models/audio/sv/speaker_change_locator_xvector.py b/modelscope/models/audio/sv/speaker_change_locator_xvector.py new file mode 100644 index 00000000..72e7b3d0 --- /dev/null +++ b/modelscope/models/audio/sv/speaker_change_locator_xvector.py @@ -0,0 +1,329 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from collections import OrderedDict +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.models.audio.sv.TDNN import Xvector +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class MultiHeadSelfAttention(nn.Module): + + def __init__(self, n_units, h=8, dropout=0.1): + super(MultiHeadSelfAttention, self).__init__() + self.linearQ = nn.Linear(n_units, n_units) + self.linearK = nn.Linear(n_units, n_units) + self.linearV = nn.Linear(n_units, n_units) + self.linearO = nn.Linear(n_units, n_units) + self.d_k = n_units // h + self.h = h + self.dropout = nn.Dropout(p=dropout) + self.att = None + + def forward(self, x, batch_size): + # x: (BT, F) + q = self.linearQ(x).reshape(batch_size, -1, self.h, self.d_k) + k = self.linearK(x).reshape(batch_size, -1, self.h, self.d_k) + v = self.linearV(x).reshape(batch_size, -1, self.h, self.d_k) + scores = torch.matmul(q.transpose(1, 2), k.permute( + 0, 2, 3, 1)) / np.sqrt(self.d_k) + # scores: (B, h, T, T) + self.att = F.softmax(scores, dim=3) + p_att = self.dropout(self.att) + # v : (B, T, h, d_k) + # p_att : (B, h, T, T) + x = torch.matmul(p_att, v.transpose(1, 2)) + # x : (B, h, T, d_k) + x = x.transpose(1, 2).reshape(-1, self.h * self.d_k) + return self.linearO(x) + + +class PositionwiseFeedForward(nn.Module): + + def __init__(self, n_units, d_units, dropout): + super(PositionwiseFeedForward, self).__init__() + self.linear1 = nn.Linear(n_units, d_units) + self.linear2 = nn.Linear(d_units, n_units) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x): + return self.linear2(self.dropout(F.relu(self.linear1(x)))) + + +class PosEncoding(nn.Module): + + def __init__(self, max_seq_len, d_word_vec): + super(PosEncoding, self).__init__() + pos_enc = np.array([[ + pos / np.power(10000, 2.0 * 
(j // 2) / d_word_vec) + for j in range(d_word_vec) + ] for pos in range(max_seq_len)]) + pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2]) + pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2]) + pad_row = np.zeros([1, d_word_vec]) + pos_enc = np.concatenate([pad_row, pos_enc]).astype(np.float32) + + self.pos_enc = torch.nn.Embedding(max_seq_len + 1, d_word_vec) + self.pos_enc.weight = torch.nn.Parameter( + torch.from_numpy(pos_enc), requires_grad=False) + + def forward(self, input_len): + max_len = torch.max(input_len) + input_pos = torch.LongTensor([ + list(range(1, len + 1)) + [0] * (max_len - len) + for len in input_len + ]) + + input_pos = input_pos.to(list(self.pos_enc.parameters())[0].device) + return self.pos_enc(input_pos) + + +class TransformerEncoder(nn.Module): + + def __init__(self, + idim, + n_units=256, + n_layers=2, + e_units=512, + h=4, + dropout=0.1): + super(TransformerEncoder, self).__init__() + self.linear_in = nn.Linear(idim, n_units) + self.lnorm_in = nn.LayerNorm(n_units) + + self.n_layers = n_layers + self.dropout = nn.Dropout(p=dropout) + for i in range(n_layers): + setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units)) + setattr(self, '{}{:d}'.format('self_att_', i), + MultiHeadSelfAttention(n_units, h)) + setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units)) + setattr(self, '{}{:d}'.format('ff_', i), + PositionwiseFeedForward(n_units, e_units, dropout)) + self.lnorm_out = nn.LayerNorm(n_units) + + def forward(self, x): + # x: [B, num_anchors, T, n_in] + bs, num, tframe, dim = x.size() + x = x.reshape(bs * num, tframe, -1) # [B*num_anchors, T, dim] + # x: (B, T, F) ... batch, time, (mel)freq + B_size, T_size, _ = x.shape + # e: (BT, F) + e = self.linear_in(x.reshape(B_size * T_size, -1)) + # Encoder stack + for i in range(self.n_layers): + # layer normalization + e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e) + # self-attention + s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0]) + # residual + e = e + self.dropout(s) + # layer normalization + e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e) + # positionwise feed-forward + s = getattr(self, '{}{:d}'.format('ff_', i))(e) + # residual + e = e + self.dropout(s) + # final layer normalization + # output: (BT, F) + # output: (B, F, T) + output = self.lnorm_out(e).reshape(B_size, T_size, -1) + output = output.reshape(bs, num, tframe, + -1) # [B, num_anchors, T, dim] + return output + + +class TransformerEncoder_out(nn.Module): + + def __init__(self, + idim, + n_units=256, + n_layers=2, + e_units=512, + h=4, + dropout=0.1): + super(TransformerEncoder_out, self).__init__() + self.linear_in = nn.Linear(idim, n_units) + self.lnorm_in = nn.LayerNorm(n_units) + + self.n_layers = n_layers + self.dropout = nn.Dropout(p=dropout) + for i in range(n_layers): + setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units)) + setattr(self, '{}{:d}'.format('self_att_', i), + MultiHeadSelfAttention(n_units, h)) + setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units)) + setattr(self, '{}{:d}'.format('ff_', i), + PositionwiseFeedForward(n_units, e_units, dropout)) + self.lnorm_out = nn.LayerNorm(n_units) + + def forward(self, x): + # x: (B, T, F) + B_size, T_size, _ = x.shape + # e: (BT, F) + e = self.linear_in(x.reshape(B_size * T_size, -1)) + # Encoder stack + for i in range(self.n_layers): + # layer normalization + e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e) + # self-attention + s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0]) + # residual + e = e + 
self.dropout(s) + # layer normalization + e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e) + # positionwise feed-forward + s = getattr(self, '{}{:d}'.format('ff_', i))(e) + # residual + e = e + self.dropout(s) + # final layer normalization + # output: (BT, F) + # output: (B, T, F) + output = self.lnorm_out(e).reshape(B_size, T_size, -1) + return output + + +class OutLayer(nn.Module): + + def __init__(self, n_units=256, num_anchors=2): + super(OutLayer, self).__init__() + self.rnn_combine = TransformerEncoder_out(num_anchors * n_units, + n_units) + self.out_linear = nn.Linear(n_units // num_anchors, 1) + + def forward(self, input): + # input: [B, num_anchors, T, dim] + bs, num, tframe, dim = input.size() + output = input.permute(0, 2, 1, + 3).reshape(bs, tframe, + -1) # [Bs, t, num_anchors*dim] + output = self.rnn_combine(output) # [Bs, t, n_units] + output = output.reshape( + bs, tframe, num, -1) # [Bs, t, num_anchors, n_units//num_anchors] + output = self.out_linear(output).squeeze(-1) # [Bs, t, num_anchors] + + return output + + +class TransformerDetector(nn.Module): + + def __init__(self, + frame_dim=512, + anchor_dim=192, + hidden_dim=256, + max_seq_len=500): + super(TransformerDetector, self).__init__() + self.detection = TransformerEncoder( + idim=frame_dim + anchor_dim, n_units=hidden_dim) + self.output = OutLayer(n_units=hidden_dim) + self.pos_enc = PosEncoding(max_seq_len, hidden_dim) + + def forward(self, feats, anchors): + # feats: [1, t, fdim] + num_frames = feats.shape[1] + num_anchors = anchors.shape[1] + bs = feats.shape[0] + feats = feats.unsqueeze(1).repeat( + 1, num_anchors, 1, 1) # shape: [Bs, num_anchors, t, fdim] + anchors = anchors.unsqueeze(2).repeat( + 1, 1, num_frames, 1) # shape: [Bs, num_anchors, t, xdim] + sd_in = torch.cat((feats, anchors), + dim=-1) # shape: [Bs, num_anchors, t, fdim+xdim] + sd_out = self.detection(sd_in) # shape: [Bs, num_anchors, t, sd_dim] + + # pos + pos_emb = self.pos_enc(torch.tensor([num_frames] * (bs * num_anchors))) + pos_emb = pos_emb.reshape(bs, num_anchors, num_frames, -1) + sd_out += pos_emb + + # output + output = self.output(sd_out) # shape: [Bs, t, num_anchors] + + return output + + +@MODELS.register_module( + Tasks.speaker_diarization, module_name=Models.scl_sd_xvector) +class SpeakerChangeLocatorTransformer(TorchModel): + r"""A speaekr change locator using the transformer architecture as the backbone. + Args: + model_dir: A model dir. + model_config: The model config. 
+ """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + + self.feature_dim = self.model_config['fbank_dim'] + frame_size = self.model_config['frame_size'] + anchor_size = self.model_config['anchor_size'] + self.device = create_device(kwargs['device']) + + self.encoder = Xvector(in_channels=self.feature_dim) + self.backend = TransformerDetector( + frame_dim=frame_size, anchor_dim=anchor_size) + + pretrained_encoder = kwargs['pretrained_encoder'] + pretrained_backend = kwargs['pretrained_backend'] + + self.__load_check_point(pretrained_encoder, pretrained_backend) + + self.encoder.to(self.device) + self.backend.to(self.device) + self.encoder.eval() + self.backend.eval() + + def forward(self, audio, anchors): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if isinstance(anchors, np.ndarray): + anchors = torch.from_numpy(anchors) + assert len(audio.shape) == 2 and audio.shape[ + 0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]' + assert len( + anchors.shape + ) == 3 and anchors.shape[0] == 1 and anchors.shape[ + 1] == 2, 'modelscope error: the shape of input anchors to model needs to be [1, 2, D]' + # audio shape: [1, T] + feature = self.__extract_feature(audio) + frame_state = self.encoder(feature.to(self.device)) + output = self.backend(frame_state, anchors.to(self.device)) + output = output.squeeze(0).detach().cpu().sigmoid() + + time_scale_factor = int(np.ceil(feature.shape[1] / output.shape[0])) + output = output.unsqueeze(1).expand(-1, time_scale_factor, + -1).reshape(-1, output.shape[-1]) + return output + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point( + self, + pretrained_encoder, + pretrained_backend, + ): + self.encoder.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_encoder), + map_location=torch.device('cpu'))) + + self.backend.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_backend), + map_location=torch.device('cpu'))) diff --git a/modelscope/pipelines/audio/segmentation_clustering_pipeline.py b/modelscope/pipelines/audio/segmentation_clustering_pipeline.py index 326d8787..e4810bcf 100644 --- a/modelscope/pipelines/audio/segmentation_clustering_pipeline.py +++ b/modelscope/pipelines/audio/segmentation_clustering_pipeline.py @@ -92,8 +92,9 @@ class SegmentationClusteringPipeline(Pipeline): def forward(self, input: list) -> np.ndarray: embeddings = [] for s in input: - _, embs = self.sv_pipeline([s[2]], output_emb=True) - embeddings.append(embs) + save_dict = self.sv_pipeline([s[2]], output_emb=True) + if save_dict['embs'].shape == (1, 192): + embeddings.append(save_dict['embs']) embeddings = np.concatenate(embeddings) return embeddings diff --git a/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py b/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py index ef91d83b..ba28ed6e 100644 --- a/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py +++ b/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py @@ -3,8 +3,10 @@ import io from typing import Any, Dict, List, Union +import numpy as np import soundfile as sf import torch +import torchaudio from modelscope.fileio import File from modelscope.metainfo 
import Pipelines @@ -46,64 +48,111 @@ class ERes2Net_Pipeline(Pipeline): self.model_config = self.model.model_config self.config = self.model.other_config self.thr = self.config['yesOrno_thr'] + self.save_dict = {} def __call__(self, - in_audios: List[str], - thr: float = None) -> Dict[str, Any]: + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): if thr is not None: self.thr = thr if self.thr < -1 or self.thr > 1: raise ValueError( 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' % self.thr) - outputs = self.preprocess(in_audios) - outputs = self.forward(outputs) - outputs = self.postprocess(outputs) - - return outputs - - def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - emb1 = self.model(inputs['data1']) - emb2 = self.model(inputs['data2']) - - return {'emb1': emb1, 'emb2': emb2} - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - score = self.compute_cos_similarity(inputs['emb1'], inputs['emb2']) - score = round(score, 5) - if score >= self.thr: - ans = 'yes' + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict else: - ans = 'no' + return outputs - return {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs - def preprocess(self, inputs: List[str], - **preprocess_params) -> Dict[str, Any]: - if len(inputs) != 2: - raise ValueError( - 'modelscope error: Two input audio files are required.') - output = {} + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] for i in range(len(inputs)): if isinstance(inputs[i], str): file_bytes = File.read(inputs[i]) data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') if len(data.shape) == 2: data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) if fs != self.model_config['sample_rate']: - raise ValueError( - 'modelscope error: Only support %d sample rate files' - % self.model_cfg['sample_rate']) - output['data%d' % - (i + 1)] = torch.from_numpy(data).unsqueeze(0) + logger.warning( + 'The sample rate of audio is not %d, resample it.' 
+                        % self.model_config['sample_rate'])
+                    data, fs = torchaudio.sox_effects.apply_effects_tensor(
+                        data,
+                        fs,
+                        effects=[[
+                            'rate',
+                            str(self.model_config['sample_rate'])
+                        ]])
+                    data = data.squeeze(0)
+            elif isinstance(inputs[i], np.ndarray):
+                assert len(
+                    inputs[i].shape
+                ) == 1, 'modelscope error: Input array should be 1-D [T]'
+                data = inputs[i]
+                if data.dtype in ['int16', 'int32', 'int64']:
+                    data = (data / (1 << 15)).astype('float32')
+                else:
+                    data = data.astype('float32')
+                data = torch.from_numpy(data)
             else:
                 raise ValueError(
-                    'modelscope error: The input type is temporarily restricted to audio file address'
-                    % i)
+                    'modelscope error: The input type is restricted to audio file path and numpy array.'
+                )
+            output.append(data)
         return output

-    def compute_cos_similarity(self, emb1: torch.Tensor,
-                               emb2: torch.Tensor) -> float:
+    def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor],
+                               emb2: Union[np.ndarray, torch.Tensor]) -> float:
+        if isinstance(emb1, np.ndarray):
+            emb1 = torch.from_numpy(emb1)
+        if isinstance(emb2, np.ndarray):
+            emb2 = torch.from_numpy(emb2)
+        if len(emb1.shape) == 1:
+            emb1 = emb1.unsqueeze(0)
+        if len(emb2.shape) == 1:
+            emb2 = emb2.unsqueeze(0)
         assert len(emb1.shape) == 2 and len(emb2.shape) == 2
         cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
         cosine = cos(emb1, emb2)
diff --git a/modelscope/pipelines/audio/speaker_verification_light_pipeline.py b/modelscope/pipelines/audio/speaker_verification_light_pipeline.py
index e3d1968a..7bfc7964 100644
--- a/modelscope/pipelines/audio/speaker_verification_light_pipeline.py
+++ b/modelscope/pipelines/audio/speaker_verification_light_pipeline.py
@@ -50,6 +50,7 @@ class SpeakerVerificationPipeline(Pipeline):
         self.model_config = self.model.model_config
         self.config = self.model.other_config
         self.thr = self.config['yesOrno_thr']
+        self.save_dict = {}

     def __call__(self,
                  in_audios: Union[np.ndarray, list],
@@ -66,7 +67,9 @@ class SpeakerVerificationPipeline(Pipeline):
         embs = self.forward(wavs)
         outputs = self.postprocess(embs, in_audios, save_dir)
         if output_emb:
-            return outputs, embs.numpy()
+            self.save_dict['outputs'] = outputs
+            self.save_dict['embs'] = embs.numpy()
+            return self.save_dict
         else:
             return outputs
diff --git a/tests/pipelines/test_speaker_verification.py b/tests/pipelines/test_speaker_verification.py
index 34e5a9d9..c5fe0004 100644
--- a/tests/pipelines/test_speaker_verification.py
+++ b/tests/pipelines/test_speaker_verification.py
@@ -23,8 +23,10 @@ class SpeakerVerificationTest(unittest.TestCase):
     campplus_voxceleb_16k_model_id = 'damo/speech_campplus_sv_en_voxceleb_16k'
     rdino_voxceleb_16k_model_id = 'damo/speech_rdino_ecapa_tdnn_sv_en_voxceleb_16k'
     speaker_change_locating_cn_model_id = 'damo/speech_campplus-transformer_scl_zh-cn_16k-common'
+    speaker_change_lcoating_xvector_cn_model_id = 'damo/speech_xvector_transformer_scl_zh-cn_16k-common'
     eres2net_voxceleb_16k_model_id = 'damo/speech_eres2net_sv_en_voxceleb_16k'
     speaker_diarization_model_id = 'damo/speech_campplus_speaker-diarization_common'
+    speaker_diarization_eres2net_model_id = 'damo/speech_eres2net-large_speaker-diarization_common'
     lre_campplus_en_cn_16k_model_id = 'damo/speech_campplus_lre_en-cn_16k'
     lre_eres2net_base_en_cn_16k_model_id = 'damo/speech_eres2net_base_lre_en-cn_16k'
     lre_eres2net_large_en_cn_16k_model_id = 'damo/speech_eres2net_large_lre_en-cn_16k'
@@ -123,6 +125,17 @@ class SpeakerVerificationTest(unittest.TestCase):
         print(result)
         self.assertTrue(OutputKeys.TEXT in result)

+    @unittest.skipUnless(test_level() >= 0, 'skip 
test in current test level') + def test_run_with_speaker_change_locating_xvector_cn_16k(self): + logger.info( + 'Run speaker change locating for xvector-transformer model') + result = self.run_pipeline( + model_id=self.speaker_change_lcoating_xvector_cn_model_id, + task=Tasks.speaker_diarization, + audios=SCL_EXAMPLE_WAV) + print(result) + self.assertTrue(OutputKeys.TEXT in result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_speaker_verification_eres2net_voxceleb_16k(self): logger.info('Run speaker verification for eres2net_voxceleb_16k model') @@ -140,7 +153,7 @@ class SpeakerVerificationTest(unittest.TestCase): result = self.run_pipeline( model_id=self.eres2net_aug_zh_cn_16k_common_model_id, audios=[SPEAKER1_A_EN_16K_WAV, SPEAKER1_B_EN_16K_WAV], - model_revision='v1.0.4') + model_revision='v1.0.5') print(result) self.assertTrue(OutputKeys.SCORE in result) @@ -154,6 +167,16 @@ class SpeakerVerificationTest(unittest.TestCase): print(result) self.assertTrue(OutputKeys.TEXT in result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_eres2net_speaker_diarization_common(self): + logger.info('Run eres2net speaker diarization task') + result = self.run_pipeline( + model_id=self.speaker_diarization_eres2net_model_id, + task=Tasks.speaker_diarization, + audios=SD_EXAMPLE_WAV) + print(result) + self.assertTrue(OutputKeys.TEXT in result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_language_recognition_campplus_en_cn_16k(self): logger.info('Run language recognition for campplus_en_cn_16k') From ae00f39b44ed40e642f26272c662c7dce682b0fd Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Thu, 26 Oct 2023 13:24:06 +0800 Subject: [PATCH 12/18] fix shop_segmentation to use old timm lib Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14443535 * fix shop_segmentation to use old timm lib --- modelscope/models/cv/shop_segmentation/models.py | 4 ++-- modelscope/models/cv/shop_segmentation/neck_fpn.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py index 1b07a08c..a206e9f1 100644 --- a/modelscope/models/cv/shop_segmentation/models.py +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -11,8 +11,8 @@ from collections import OrderedDict import torch import torch.nn.functional as F import torch.utils.checkpoint as checkpoint -from timm.layers.drop import drop_path -from timm.layers.weight_init import trunc_normal_ +from timm.models.layers.drop import drop_path +from timm.models.layers.weight_init import trunc_normal_ from torch import nn diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py index 12c11d76..d344de71 100644 --- a/modelscope/models/cv/shop_segmentation/neck_fpn.py +++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py @@ -8,8 +8,8 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule -from timm.layers.drop import drop_path -from timm.layers.weight_init import trunc_normal_ +from timm.models.layers.drop import drop_path +from timm.models.layers.weight_init import trunc_normal_ from .common import resize From 3b4a7b48ef3b68dbfd9fa09af66a00ddb8251c99 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Thu, 26 Oct 2023 19:16:35 +0800 Subject: [PATCH 13/18] fix shop_segmentation to use old timm lib and bump version to 
1.9.4rc2 --- modelscope/models/cv/shop_segmentation/head_fpn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py index dfa284d4..0d4027cb 100644 --- a/modelscope/models/cv/shop_segmentation/head_fpn.py +++ b/modelscope/models/cv/shop_segmentation/head_fpn.py @@ -9,8 +9,8 @@ import numpy as np import torch import torch.nn as nn from mmcv.cnn import ConvModule -from timm.layers.drop import drop_path -from timm.layers.weight_init import trunc_normal_ +from timm.models.layers.drop import drop_path +from timm.models.layers.weight_init import trunc_normal_ from .common import Upsample, resize From acc61da91c1b62ec4d1bf57d01ff7f944c25f9c9 Mon Sep 17 00:00:00 2001 From: "rujiao.lrj" Date: Fri, 27 Oct 2023 16:51:50 +0800 Subject: [PATCH 14/18] Fix the numpy bug for card correction Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/14461396 * fix the numpy bug for card detection correction * add output(score, angle, print type) --- modelscope/outputs/outputs.py | 11 ++++++----- .../cv/card_detection_correction_pipeline.py | 17 +++++++++++++++-- .../pipelines/cv/ocr_utils/table_process.py | 4 ++-- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index 368abad6..a32fc157 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -442,8 +442,10 @@ TASK_OUTPUTS = { Tasks.table_recognition: [OutputKeys.POLYGONS], Tasks.lineless_table_recognition: [OutputKeys.POLYGONS, OutputKeys.BOXES], Tasks.license_plate_detection: [OutputKeys.POLYGONS, OutputKeys.TEXT], - Tasks.card_detection_correction: - [OutputKeys.POLYGONS, OutputKeys.OUTPUT_IMGS], + Tasks.card_detection_correction: [ + OutputKeys.POLYGONS, OutputKeys.SCORES, OutputKeys.OUTPUT_IMGS, + OutputKeys.LABELS, OutputKeys.LAYOUT + ], # ocr recognition result for single sample # { @@ -672,9 +674,8 @@ TASK_OUTPUTS = { # np.array # 2D array containing only 0, 1 # ] # } - Tasks.image_segmentation: [ - OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS - ], + Tasks.image_segmentation: + [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS], # video panoptic segmentation result for single sample # "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]] diff --git a/modelscope/pipelines/cv/card_detection_correction_pipeline.py b/modelscope/pipelines/cv/card_detection_correction_pipeline.py index dac174de..c4b87d0d 100644 --- a/modelscope/pipelines/cv/card_detection_correction_pipeline.py +++ b/modelscope/pipelines/cv/card_detection_correction_pipeline.py @@ -172,13 +172,19 @@ class CardDetectionCorrection(Pipeline): wh = output['wh'] reg = output['reg'] angle_cls = output['cls'].sigmoid_() + ftype_cls = output['ftype'].sigmoid_() bbox, inds = bbox_decode(hm, wh, reg=reg, K=self.K) angle_cls = decode_by_ind( angle_cls, inds, K=self.K).detach().cpu().numpy() + ftype_cls = decode_by_ind( + ftype_cls, inds, + K=self.K).detach().cpu().numpy().astype(np.float32) bbox = bbox.detach().cpu().numpy() for i in range(bbox.shape[1]): bbox[0][i][9] = angle_cls[0][i] + bbox = np.concatenate((bbox, np.expand_dims(ftype_cls, axis=-1)), + axis=-1) bbox = nms(bbox, 0.3) bbox = bbox_post_process(bbox.copy(), [meta['c'].cpu().numpy()], [meta['s']], meta['out_height'], @@ -187,6 +193,8 @@ class CardDetectionCorrection(Pipeline): res = [] angle = [] sub_imgs = [] + ftype = [] + score = [] for idx, box in enumerate(bbox[0]): if box[8] > 0.3: 
angle.append(int(box[9])) @@ -200,9 +208,14 @@ class CardDetectionCorrection(Pipeline): if angle[-1] == 3: sub_img = cv2.rotate(sub_img, 0) sub_imgs.append(sub_img) + ftype.append(int(box[10])) + score.append(box[8]) result = { - OutputKeys.POLYGONS: np.array(res), - OutputKeys.OUTPUT_IMGS: np.array(sub_imgs) + OutputKeys.POLYGONS: res, + OutputKeys.SCORES: score, + OutputKeys.OUTPUT_IMGS: sub_imgs, + OutputKeys.LABELS: angle, + OutputKeys.LAYOUT: np.array(ftype) } return result diff --git a/modelscope/pipelines/cv/ocr_utils/table_process.py b/modelscope/pipelines/cv/ocr_utils/table_process.py index 3bf28e84..f67bfc72 100644 --- a/modelscope/pipelines/cv/ocr_utils/table_process.py +++ b/modelscope/pipelines/cv/ocr_utils/table_process.py @@ -232,13 +232,13 @@ def nms(dets, thresh): keep = [] for i in range(len(dets)): box = dets[i] - if box[-1] < thresh: + if box[8] < thresh: break max_score_index = -1 ctx = (dets[i][0] + dets[i][2] + dets[i][4] + dets[i][6]) / 4 cty = (dets[i][1] + dets[i][3] + dets[i][5] + dets[i][7]) / 4 for j in range(len(dets)): - if i == j or dets[j][-1] < thresh: + if i == j or dets[j][8] < thresh: break x1, y1 = dets[j][0], dets[j][1] x2, y2 = dets[j][2], dets[j][3] From 5f0fcd7bfd6be81f5a545584714b2de8956f561b Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Sat, 4 Nov 2023 12:11:13 +0800 Subject: [PATCH 15/18] fix uie trainer: eval failed (#617) * fix uie trainer * merge master --- modelscope/trainers/audio/asr_trainer.py | 3 ++- modelscope/trainers/nlp/siamese_uie_trainer.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modelscope/trainers/audio/asr_trainer.py b/modelscope/trainers/audio/asr_trainer.py index 1162d6ff..04d57f51 100644 --- a/modelscope/trainers/audio/asr_trainer.py +++ b/modelscope/trainers/audio/asr_trainer.py @@ -126,7 +126,8 @@ class ASRTrainer(BaseTrainer): cfg_dict['cmvn_file'] = os.path.join( cur_dir, config['model']['model_config']['mvn_file']) cfg_dict['seg_dict'] = os.path.join(cur_dir, 'seg_dict') - cfg_dict['bpemodel'] = os.path.join(cur_dir, config['model']['model_config']['bpemodel']) + cfg_dict['bpemodel'] = os.path.join( + cur_dir, config['model']['model_config']['bpemodel']) if 'init_model' in config['model']['model_config']: cfg_dict['init_model'] = os.path.join( cur_dir, config['model']['model_config']['init_model']) diff --git a/modelscope/trainers/nlp/siamese_uie_trainer.py b/modelscope/trainers/nlp/siamese_uie_trainer.py index 782fd360..d0179b9a 100644 --- a/modelscope/trainers/nlp/siamese_uie_trainer.py +++ b/modelscope/trainers/nlp/siamese_uie_trainer.py @@ -329,7 +329,7 @@ class SiameseUIETrainer(EpochBasedTrainer): {"accuracy": 0.5091743119266054, "f1": 0.673780487804878} """ pipeline_uie = pipeline( - Tasks.siamese_uie, self.model, device=self.device) + Tasks.siamese_uie, self.model, device=str(self.device)) if checkpoint_path is not None and os.path.isfile(checkpoint_path): from modelscope.trainers.hooks import LoadCheckpointHook LoadCheckpointHook.load_checkpoint(checkpoint_path, self) From fafb0fe0135b822c04b786187e472b478265b014 Mon Sep 17 00:00:00 2001 From: Firmament-cyou <57580313+Firmament-cyou@users.noreply.github.com> Date: Sat, 4 Nov 2023 20:44:55 +0800 Subject: [PATCH 16/18] llm pipeline support chatglm3 (#618) --- modelscope/pipelines/builder.py | 47 +++----------- modelscope/pipelines/nlp/llm_pipeline.py | 78 ++++++++++++++++++++++-- tests/pipelines/test_llm_pipeline.py | 7 +++ 3 files changed, 88 insertions(+), 44 deletions(-) diff 
--git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index ddc3c422..f44f7381 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -1,16 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os -import os.path as osp from typing import List, Optional, Union -from modelscope.hub.file_download import model_file_download from modelscope.hub.snapshot_download import snapshot_download -from modelscope.metainfo import DEFAULT_MODEL_FOR_PIPELINE, Pipelines +from modelscope.metainfo import DEFAULT_MODEL_FOR_PIPELINE from modelscope.models.base import Model -from modelscope.utils.config import Config, ConfigDict, check_config +from modelscope.utils.config import ConfigDict, check_config from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, - ModelFile, ThirdParty) + ThirdParty) from modelscope.utils.hub import read_config from modelscope.utils.plugins import (register_modelhub_repo, register_plugins_repo) @@ -205,42 +203,13 @@ def get_default_pipeline_info(task): def llm_first_checker(model: Union[str, List[str], Model, List[Model]], revision: Optional[str]) -> Optional[str]: - from modelscope.pipelines.nlp.llm_pipeline import LLM_FORMAT_MAP - - def get_file_name(model: str, cfg_name: str, - revision: Optional[str]) -> Optional[str]: - if osp.exists(model): - return osp.join(model, cfg_name) - try: - return model_file_download(model, cfg_name, revision=revision) - except Exception: - return None - - def parse_and_get(file: Optional[str], pattern: str) -> Optional[str]: - if file is None or not osp.exists(file): - return None - return Config.from_file(file).safe_get(pattern) - - def get_model_type(model: str, revision: Optional[str]) -> Optional[str]: - cfg_file = get_file_name(model, ModelFile.CONFIGURATION, revision) - hf_cfg_file = get_file_name(model, ModelFile.CONFIG, revision) - cfg_model_type = parse_and_get(cfg_file, 'model.type') - hf_cfg_model_type = parse_and_get(hf_cfg_file, 'model_type') - return cfg_model_type or hf_cfg_model_type - - def get_adapter_type(model: str, revision: Optional[str]) -> Optional[str]: - cfg_file = get_file_name(model, ModelFile.CONFIGURATION, revision) - model = parse_and_get(cfg_file, 'adapter_cfg.model_id_or_path') - revision = parse_and_get(cfg_file, 'adapter_cfg.model_revision') - return None if model is None else get_model_type(model, revision) + from .nlp.llm_pipeline import ModelTypeHelper, LLM_FORMAT_MAP if isinstance(model, list): model = model[0] if not isinstance(model, str): model = model.model_dir - model_type = get_model_type(model, revision) \ - or get_adapter_type(model, revision) - if model_type is not None: - model_type = model_type.lower().split('-')[0] - if model_type in LLM_FORMAT_MAP: - return 'llm' + model_type = ModelTypeHelper.get( + model, revision, with_adapter=True, split='-') + if model_type in LLM_FORMAT_MAP: + return 'llm' diff --git a/modelscope/pipelines/nlp/llm_pipeline.py b/modelscope/pipelines/nlp/llm_pipeline.py index a2e65ee9..5cd2dcb1 100644 --- a/modelscope/pipelines/nlp/llm_pipeline.py +++ b/modelscope/pipelines/nlp/llm_pipeline.py @@ -1,7 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os +import os.path as osp from contextlib import contextmanager -from typing import Any, Callable, Dict, Iterator, List, Tuple, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union import json import torch @@ -22,6 +23,57 @@ from modelscope.utils.logger import get_logger logger = get_logger() +class ModelTypeHelper: + + @staticmethod + def _get_file_name(model: str, cfg_name: str, + revision: Optional[str]) -> Optional[str]: + if osp.exists(model): + return osp.join(model, cfg_name) + try: + return model_file_download(model, cfg_name, revision=revision) + except Exception: + return None + + @staticmethod + def _parse_and_get(file: Optional[str], pattern: str) -> Optional[str]: + if file is None or not osp.exists(file): + return None + return Config.from_file(file).safe_get(pattern) + + @classmethod + def _get(cls, model: str, revision: Optional[str]) -> Optional[str]: + cfg_file = cls._get_file_name(model, ModelFile.CONFIGURATION, revision) + hf_cfg_file = cls._get_file_name(model, ModelFile.CONFIG, revision) + cfg_model_type = cls._parse_and_get(cfg_file, 'model.type') + hf_cfg_model_type = cls._parse_and_get(hf_cfg_file, 'model_type') + return cfg_model_type or hf_cfg_model_type + + @classmethod + def _get_adapter(cls, model: str, + revision: Optional[str]) -> Optional[str]: + cfg_file = cls._get_file_name(model, ModelFile.CONFIGURATION, revision) + model = cls._parse_and_get(cfg_file, 'adapter_cfg.model_id_or_path') + revision = cls._parse_and_get(cfg_file, 'adapter_cfg.model_revision') + return None if model is None else cls._get(model, revision) + + @classmethod + def get(cls, + model: str, + revision: Optional[str] = None, + with_adapter: bool = False, + split: Optional[str] = None) -> Optional[str]: + model_type = cls._get(model, revision) + if model_type is None and with_adapter: + model_type = cls._get_adapter(model, revision) + if model_type is None: + return None + model_type = model_type.lower() + if split is None: + return model_type + return model_type.split(split)[0] + + @PIPELINES.register_module(Tasks.chat, module_name='llm') @PIPELINES.register_module(Tasks.text_generation, module_name='llm') class LLMPipeline(Pipeline): @@ -121,8 +173,7 @@ class LLMPipeline(Pipeline): format_messages] if format_messages is None: - model_type = self.cfg.safe_get('model.type', - '').lower().split('-')[0] + model_type = ModelTypeHelper.get(self.model.model_dir, split='-') if model_type in LLM_FORMAT_MAP: format_messages, format_output, tokenizer_class = LLM_FORMAT_MAP[ model_type] @@ -192,7 +243,10 @@ class LLMPipeline(Pipeline): device = 'cpu' else: raise ValueError('model does not have `device` attribute!') - return {k: v.to(device) for k, v in tokens.items()} + return { + k: (v.to(device) if isinstance(v, torch.Tensor) else v) + for k, v in tokens.items() + } def postprocess(self, outputs, is_messages: bool, **kwargs): @@ -488,6 +542,19 @@ def wizardcode_format_messages(messages, tokenizer, **kwargs): return inputs +def chatglm3_format_messages(messages, tokenizer, **kwargs): + messages = messages['messages'] + query, history = messages[-1]['content'], messages[:-1] + inputs = tokenizer.build_chat_input(query, history=history) + eos_token_id = [ + tokenizer.eos_token_id, + tokenizer.get_command('<|user|>'), + tokenizer.get_command('<|observation|>') + ] + inputs['eos_token_id'] = eos_token_id + return inputs + + LLM_FORMAT_MAP = { 'chatglm2': (chatglm2_format_messages, chatglm2_format_output, ChatGLM2Tokenizer), @@ -497,5 +564,6 @@ LLM_FORMAT_MAP = 
{
     'baichuan': (baichuan_format_messages, None, None),
     'baichuan2': (baichuan_format_messages, None, None),
     'wizardlm': (wizardlm_format_messages, None, None),
-    'wizardcode': (wizardcode_format_messages, None, None)
+    'wizardcode': (wizardcode_format_messages, None, None),
+    'chatglm': (chatglm3_format_messages, chatglm2_format_output, None),
 }
diff --git a/tests/pipelines/test_llm_pipeline.py b/tests/pipelines/test_llm_pipeline.py
index 9b7e832f..b5ace810 100644
--- a/tests/pipelines/test_llm_pipeline.py
+++ b/tests/pipelines/test_llm_pipeline.py
@@ -152,6 +152,13 @@ class LLMPipelineTest(unittest.TestCase):
         print('messages: ', pipe(self.messages_zh, **self.gen_cfg))
         print('prompt: ', pipe(self.prompt_zh, **self.gen_cfg))

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_chatglm3(self):
+        pipe = pipeline(
+            task='chat', model='ZhipuAI/chatglm3-6b', llm_first=True)
+        print('messages: ', pipe(self.messages_zh, **self.gen_cfg))
+        print('prompt: ', pipe(self.prompt_zh, **self.gen_cfg))
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_llama2(self):
         pipe = pipeline(
From 02ce95d5b8b8a2ea88ebfe3610792d87ec94584a Mon Sep 17 00:00:00 2001
From: wenmeng zhou
Date: Wed, 8 Nov 2023 00:51:10 +0800
Subject: [PATCH 17/18] add llm riddles (#621)

* add multi online support

* add readme and requirements.txt
---
 examples/apps/llm_riddles/README.md        |  49 +++
 examples/apps/llm_riddles/README_CN.md     |  49 +++
 examples/apps/llm_riddles/app.py           | 417 +++++++++++++++++++++
 examples/apps/llm_riddles/requirements.txt |   3 +
 4 files changed, 518 insertions(+)
 create mode 100644 examples/apps/llm_riddles/README.md
 create mode 100644 examples/apps/llm_riddles/README_CN.md
 create mode 100644 examples/apps/llm_riddles/app.py
 create mode 100644 examples/apps/llm_riddles/requirements.txt

diff --git a/examples/apps/llm_riddles/README.md b/examples/apps/llm_riddles/README.md
new file mode 100644
index 00000000..445d2364
--- /dev/null
+++ b/examples/apps/llm_riddles/README.md
@@ -0,0 +1,49 @@
+# Oh No! I'm Surrounded by LLMs! (LLMRiddles)
+
+## Project Introduction
+"Oh No! I'm Surrounded by LLMs!" is an intellectual challenge game. We use GPT-4 to automatically generate the corresponding game code based on existing Large Language Model (LLM) dialogue Gradio application code within the ModelScope community, combined with preset questions from the Zhihu article ["How to Accomplish Tasks with 'Impossible'"](https://zhuanlan.zhihu.com/p/665393240), creating a unique gameplay experience. In this game, players are required to cleverly construct questions that challenge the LLM to provide answers that meet specific conditions.
+
+## Getting Started
+
+### Online Experience
+
+[LLMRiddles](https://modelscope.cn/studios/LLMRiddles/LLMRiddles/summary)
+
+### Local Execution
+To start the game, please follow the steps below:
+
+1. Clone the project code:
+   ```
+   git clone https://github.com/modelscope/modelscope.git
+   ```
+2. Navigate to the `examples/apps/llm_riddles` directory.
+3. Install the required Python dependencies with `pip install -r requirements.txt`.
+4. Run the launch command `python app.py`.
+
+## Roadmap
+- [x] Initial version source code and space experience ready.
+- [ ] Support for custom questions and validation logic integration.
+- [ ] Expand to 9 major levels, each with 9 questions.
+- [ ] Support for more open-source models.
+- [ ] Support for switching between cloud API and local inference.
+ +## Contribution Guide +We welcome everyone to contribute to "Oh No! I'm Surrounded by LLMs!", including proposing more fun questions, fixing validator corner cases, and providing more gameplay. Please follow the steps below: + +1. Visit the project address [ModelScope](https://github.com/modelscope/modelscope) and fork the project. +2. Create your feature branch in your local environment (`git checkout -b feature/AmazingFeature`). +3. Commit your changes (`git commit -m 'Add some AmazingFeature'`). +4. Push your changes to the branch (`git push origin feature/AmazingFeature`). +5. Initiate a Pull Request in the original project. + +## Community Contributors +We sincerely thank all community members who have contributed to this project, especially: + +- Idea from: [haoqiangfan](https://www.zhihu.com/people/haoqiang-fan) +- Most of the code is auto-generated by GPT-4 + +## Support +If you encounter any problems or need assistance during the game, please submit your issues on the project's [Issues page](https://github.com/modelscope/modelscope/issues). + +## Copyright and License +This project is licensed under the APACHE License. Please see the [LICENSE](https://github.com/modelscope/modelscope/blob/main/LICENSE) file in the project for more information. diff --git a/examples/apps/llm_riddles/README_CN.md b/examples/apps/llm_riddles/README_CN.md new file mode 100644 index 00000000..7141cd60 --- /dev/null +++ b/examples/apps/llm_riddles/README_CN.md @@ -0,0 +1,49 @@ +# 完蛋!我被LLM包围了!(LLMRiddles) + +## 项目简介 +《完蛋!我被LLM包围了!》是一款智力挑战游戏。该项目利用gpt4, 基于ModelScope社区内现有的LLM对话Gradio应用程序代码,结合知乎文章[《如何用“不可能”完成任务》](https://zhuanlan.zhihu.com/p/665393240)中的预设问题,自动生成了对应的游戏代码,创造了一个独特的游戏体验。在这个有溪中,玩家需要巧妙构造问题,挑战LLM给出满足特定条件的回答。 + +## 开始游戏 + +### 在线体验 + +[LLMRiddles](https://modelscope.cn/studios/LLMRiddles/LLMRiddles/summary) + +### 本地运行 +要开始游戏,请按照以下步骤操作: + +1. 克隆项目代码: + ``` + git clone https://github.com/modelscope/modelscope.git + ``` +2. 进入到`examples/apps/llm_riddles`目录。 +3. 安装所需的Python依赖`pip install -r requirements.txt`。 +4. 执行启动命令`python app.py`. + +## RoadMap +- [x] 初版本源码和创空间体验ready +- [ ] 支持自定义问题和验证逻辑接入 +- [ ] 扩充到9个大关卡,每个关卡9个问题 +- [ ] 支持更多开源模型 +- [ ] 支持云端API和本地推理切换 + +## 贡献指南 +我们欢迎大家为《完蛋!我被LLM包围了!》做出贡献,包括提出更多好玩的问题,修复validator的corner case,以及提供更多的玩法。请按以下步骤操作: + +1. 访问项目地址 [ModelScope](https://github.com/modelscope/modelscope) 并fork项目。 +2. 在你的本地环境中创建你的特性分支 (`git checkout -b feature/AmazingFeature`)。 +3. 提交你的改动 (`git commit -m 'Add some AmazingFeature'`)。 +4. 将你的改动推送到分支上 (`git push origin feature/AmazingFeature`)。 +5. 
在原项目下发起一个Pull Request。 + +## 社区贡献者 +我们诚挚感谢所有对本项目做出贡献的社区成员,特别是: + +- idea来源: [haoqiangfan](https://www.zhihu.com/people/haoqiang-fan) +- 代码大部分来自于GPT4自动生成 + +## 支持 +如果你在游戏过程中遇到任何问题或需要帮助,请通过项目的[Issues页面](https://github.com/modelscope/modelscope/issues)提交你的问题。 + +## 版权和许可 +本项目采用APACHE License许可证。请查看项目中的[LICENSE](https://github.com/modelscope/modelscope/blob/main/LICENSE)文件了解更多信息。 diff --git a/examples/apps/llm_riddles/app.py b/examples/apps/llm_riddles/app.py new file mode 100644 index 00000000..dc5c1b7b --- /dev/null +++ b/examples/apps/llm_riddles/app.py @@ -0,0 +1,417 @@ +import os +import random +import re +from http import HTTPStatus + +import dashscope +import gradio as gr +import sympy + +dashscope.api_key = os.getenv('DASHSCOPE_API_KEY') + +# 定义关卡信息和验证逻辑 + + +# 辅助函数 - 检查是否为质数 +def is_prime(num): + return sympy.isprime(num) + + +# 辅助函数 - 获取下一个质数 +def next_prime(num): + return sympy.nextprime(num) + + +# 辅助函数 - 检查是否为平方数 +def is_square(n): + return sympy.sqrt(n).is_integer + + +# 辅助函数 - 获取平方根 +def get_square_root(n): + return int(sympy.sqrt(n)) + + +def validate_palindrome_invariance(origin_input, response): + """ + 验证器函数,检查对于给定的问题,正着问和倒着问的模型回答是否一致。 + + :param origin_input: 原始问题 + :return: True 如果回答一致,否则 False + """ + + # 将原始问题倒序并提交 + reversed_question = origin_input[::-1] + reversed_response = generate_response(reversed_question) + + # 比较两个回答是否一致 + return response.strip() == reversed_response.strip() + + +def validate_palindrome_inverse(origin_input, response): + """ + 验证器函数,检查对于给定的问题,正着问和倒着问的模型的回答本身不回文且也是逆序的关系。 + + :param origin_input: 原始问题 + :param response: 模型对原始问题的回答 + :param model_fn: 能够返回模型回答的函数 + :return: True 如果回答本身不回文且也是逆序关系,否则 False + """ + + # 检查原始输入不是回文 + if origin_input == origin_input[::-1]: + return False + + # 将原始问题倒序并获取模型回答 + reversed_question = origin_input[::-1] + reversed_response = generate_response(reversed_question).strip() + + # 检查原始回答不是回文 + if response == response[::-1]: + return False + + # 比较正着问和倒着问的回答是否是逆序关系 + return response == reversed_response[::-1] + + +def validate_reciprocal_question(input, response): + """ + 验证器函数,检查给定的问题A和回答B,是否能够通过以B作为新的提问得到原始问题A作为回答。 + + :param response: 模型对问题A的回答B + :param input: 原始问题A + :param model_fn: 模型调用函数,接收问题并返回回答 + :return: True 如果以回答B作为新的提问能得到问题A作为回答,否则 False + """ + + # 确保回答B不等于原始问题A + if response == input: + return False + + # 以回答B作为新的提问 + new_response = generate_response(response) + + # 检查新的回答是否与原始问题A相等 + return new_response.strip() == input.strip() + + +challenges = [ + { + 'name': + '第一章 对话之趣', + 'problems': [ + { + 'title': '第1题 初来乍到', + 'description': '请你构造一个问题使模型的回答是一字不差的“1+1=3”(不需要引号)。', + 'validator': + lambda response, input: response.strip() == '1+1=3' + }, + { + 'title': + '第2题 小试牛刀', + 'description': + '请输入三个字以内的问题,使模型的回答在30个字以上。', + 'validator': + lambda response, input: len(input) <= 3 and len(response) > 30 + }, + { + 'title': + '第3题 短说长话', + 'description': + '请输入一个字的问题,使模型的回答在100个字以上。', + 'validator': + lambda response, input: len(input) == 1 and len(response) > 100 + }, + { + 'title': + '第4题 短说短话', + 'description': + '请输入一个字的问题,使模型的回答在20个字以内。', + 'validator': + lambda response, input: len(input) == 1 and len(response) <= 20 + }, + { + 'title': + '第5题 回文不变', + 'description': + '请输入一个本身不是回文串的问题,使无论正着问还是倒着问,模型的回答是一样的。', + # 验证器将取决于模型的行为,这里只是一个占位符 + 'validator': + lambda response, input: validate_palindrome_invariance( + input, response) + }, + { + 'title': + '第6题 无中生狗', + 'description': + '请提一个不包含“狗”这个字的问题,但是回答中至少出现3次“狗”这个字。', + 'validator': + lambda response, input: '狗' not in input and response.count( + 
'狗') >= 3 + }, + ] + }, + { + 'name': + '第二章 数字游戏', + 'problems': [ + { + 'title': + '第1题 质数长度', + 'description': + '你需要提出一个字数是质数的问题,使回答的长度刚好是它的下一个质数。', + 'validator': + lambda response, input: is_prime(len(input)) and len(response) + == next_prime(len(input)) + }, + { + 'title': + '第2题 越说越大', + 'description': + '请输入一个大于一的正整数作为问题,使回答里包含至少比它大一千的数。', + 'validator': + lambda response, input: + (input.isdigit() and int(input) > 1 and any( + int(num) > int(input) + 1000 + for num in re.findall(r'\d+', response))) + }, + { + 'title': + '第3题 越说越小', + 'description': + '请输入一个大于一的正整数作为问题,使回答里包含至少10个不同的至少比它小一千的自然数。', + 'validator': + lambda response, input: + (input.isdigit() and int(input) > 1 and (len({ + int(num) + for num in re.findall(r'\d+', response) + if int(num) < int(input) - 1000 + }) >= 10)) + }, + { + 'title': + '第4题 警世名言', + 'description': + '请在不提及1,4,5这三个字符的情况下让模型回答114514,不要有多余的字符。', + 'validator': + lambda response, input: response.strip() == '114514' and all( + c not in input for c in ['1', '4', '5']) + }, + { + 'title': + '第5题 开平方数', + 'description': + '请输入一个大于一的平方数作为问题,使回答里包含它的平方根对应的数。', + 'validator': + lambda response, input: + (input.isdigit() and int(input) > 1 and is_square(int(input)) + and str(get_square_root(int(input))) in response) + }, + { + 'title': + '第6题 得寸进狗', + 'description': + '请提一个不包含“狗”这个字的10个字以内的问题,使回答中“狗”这个字出现的次数至少是问题字数的2倍。', + 'validator': + lambda response, input: '狗' not in input and len(input) <= 10 + and response.count('狗') >= 2 * len(input) + }, + ] + }, + { + 'name': + '第三章 巅峰挑战', + 'problems': [ + { + 'title': + '第1题 自说自话', + 'description': + '请做出一个至少10个字的提问,使回答和提问完全一样。', + 'validator': + lambda response, input: input == response and len(input) >= 10 + }, + { + 'title': + '第2题 话自说自', + 'description': + '请做出一个至少10个字的提问,使回答是提问的逆序。', + 'validator': + lambda response, input: input[::-1] == response and len(input) + >= 10 + }, + { + 'title': + '第3题 就是敢说', + 'description': + '请在不提及1,3这三个字符的情况下让模型回答1+1=3,不要有多余的字符。', + 'validator': + lambda response, input: response.strip() == '1+1=3' and all( + c not in input for c in ['1', '3']) + }, + { + 'title': + '第4题 回文协变', + 'description': + '请输入一个本身不是回文串的问题,使正着问和倒着问的时候,模型的回答本身不回文且也是逆序的关系。', + 'validator': + lambda response, input: validate_palindrome_inverse( + input, response) + }, + ] + }, + { + 'name': + '第四章 无人之境', + 'problems': [ + { + 'title': + '第1题 循环往复', + 'description': + '请构造一个问题A,它的回复是不同于A的B,然后以B提问能再次得到A.', + 'validator': + lambda response, input: validate_reciprocal_question( + input, response) + }, + { + 'title': + '第2题 惜字如金', + 'description': + '请输入一个字的问题,使模型的回答在16个字以内。', + 'validator': + lambda response, input: len(input) == 1 and len(response) <= 16 + }, + { + 'title': + '第3题 自然之密', + 'description': + '请输入一个大于一的正整数作为问题,使回答里包含和它刚好相差1的数。', + 'validator': + lambda response, input: + (input.isdigit() and int(input) > 1 and any( + abs(int(num) - int(input)) == 1 + for num in re.findall(r'\d+', response))) + }, + ] + } +] + + +def get_problem(challenge_idx, problem_idx): + problems = challenges[challenge_idx]['problems'] + return problems[problem_idx] + + +def update_challenge_info(current_chapter_index, current_challenge_index): + return get_problem(current_chapter_index, + current_challenge_index)['description'] + + +def update_question_info(current_chapter_index, current_challenge_index): + global challenges + current_chapter = challenges[current_chapter_index] + challenge = get_problem(current_chapter_index, current_challenge_index) + question_info = f"""\n
{current_chapter["name"]}""" \ + f"""
\n\n
{challenge["title"]}
""" + return question_info + + +def validate_challenge(response, input, state): + print('in validate_challenge') + assert 'current_chapter_index' in state, 'current_chapter_index not found in state' + assert 'current_challenge_index' in state, 'current_challenge_index not found in state' + current_chapter_index = state['current_chapter_index'] + current_challenge_index = state['current_challenge_index'] + # 获取当前章节 + current_chapter = challenges[current_chapter_index] + # 获取当前挑战 + challenge = current_chapter['problems'][current_challenge_index] + + if challenge['validator'](response, input): + challenge_result = '挑战成功!进入下一关。' + # 检查是否还有更多挑战在当前章节 + if current_challenge_index < len(current_chapter['problems']) - 1: + # 移动到当前章节的下一个挑战 + current_challenge_index += 1 + else: + # 如果当前章节的挑战已经完成,移动到下一个章节 + current_challenge_index = 0 + if current_chapter_index < len(challenges) - 1: + current_chapter_index += 1 + else: + challenge_result = '所有挑战完成!' + else: + challenge_result = '挑战失败,请再试一次。' + state['current_chapter_index'] = current_chapter_index + state['current_challenge_index'] = current_challenge_index + print('update state: ', state) + + return challenge_result, \ + update_question_info(current_chapter_index, current_challenge_index), \ + update_challenge_info(current_chapter_index, current_challenge_index) + + +def generate_response(input): + messages = [{ + 'role': 'system', + 'content': """You are a helpful assistant.""" + }, { + 'role': 'user', + 'content': input + }] + response = dashscope.Generation.call( + model='qwen-max', + messages=messages, + # set the random seed, optional, default to 1234 if not set + seed=random.randint(1, 10000), + result_format='message', # set the result to be "message" format. + top_p=0.8) + if response.status_code == HTTPStatus.OK: + return response.output.choices[0].message.content + else: + gr.Error('网络连接错误,请重试。') + + +def on_submit(input, state): + response = generate_response(input) + history = [(input, response)] + print(history) + challenge_result, question_info, challenge_info = validate_challenge( + response, input, state) + print('validate_challenge done') + return challenge_result, history, question_info, challenge_info + + +# Gradio界面构建 +block = gr.Blocks() + +with block as demo: + state = gr.State(dict(current_challenge_index=0, current_chapter_index=0)) + current_chapter_index = 0 + current_challenge_index = 0 + gr.Markdown("""
完蛋!我被LLM包围了!
""") + gr.Markdown("""欢迎来玩LLM Riddles复刻版:完蛋!我被LLM包围了! + +你将通过本游戏对大型语言模型产生更深刻的理解。 + +在本游戏中,你需要构造一个提给一个大型语言模型的问题,使得它回复的答案符合要求。""") + question_info = gr.Markdown( + update_question_info(current_chapter_index, current_challenge_index)) + challenge_info = gr.Textbox( + value=update_challenge_info(current_chapter_index, + current_challenge_index), + label='当前挑战', + disabled=True) + challenge_result = gr.Textbox(label='挑战结果', disabled=True) + chatbot = gr.Chatbot( + lines=8, label='Qwen-max', elem_classes='control-height') + message = gr.Textbox(lines=2, label='输入') + + with gr.Row(): + submit = gr.Button('🚀 发送') + + submit.click( + on_submit, + inputs=[message, state], + outputs=[challenge_result, chatbot, question_info, challenge_info]) + +demo.queue().launch(height=800, share=True) diff --git a/examples/apps/llm_riddles/requirements.txt b/examples/apps/llm_riddles/requirements.txt new file mode 100644 index 00000000..371f609b --- /dev/null +++ b/examples/apps/llm_riddles/requirements.txt @@ -0,0 +1,3 @@ +dashscope +gradio +sympy From 069b271436be711859d5644bfe9298be5bfa6e1d Mon Sep 17 00:00:00 2001 From: wenmeng zhou Date: Wed, 8 Nov 2023 12:19:53 +0800 Subject: [PATCH 18/18] Update README_CN.md --- examples/apps/llm_riddles/README_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/apps/llm_riddles/README_CN.md b/examples/apps/llm_riddles/README_CN.md index 7141cd60..08d7bf3c 100644 --- a/examples/apps/llm_riddles/README_CN.md +++ b/examples/apps/llm_riddles/README_CN.md @@ -1,7 +1,7 @@ # 完蛋!我被LLM包围了!(LLMRiddles) ## 项目简介 -《完蛋!我被LLM包围了!》是一款智力挑战游戏。该项目利用gpt4, 基于ModelScope社区内现有的LLM对话Gradio应用程序代码,结合知乎文章[《如何用“不可能”完成任务》](https://zhuanlan.zhihu.com/p/665393240)中的预设问题,自动生成了对应的游戏代码,创造了一个独特的游戏体验。在这个有溪中,玩家需要巧妙构造问题,挑战LLM给出满足特定条件的回答。 +《完蛋!我被LLM包围了!》是一款智力挑战游戏。该项目利用gpt4, 基于ModelScope社区内现有的LLM对话Gradio应用程序代码,结合知乎文章[《如何用“不可能”完成任务》](https://zhuanlan.zhihu.com/p/665393240)中的预设问题,自动生成了对应的游戏代码,创造了一个独特的游戏体验。在这个游戏中,玩家需要巧妙构造问题,挑战LLM给出满足特定条件的回答。 ## 开始游戏