Mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-16 16:27:45 +01:00)
Merge pull request #719 from modelscope/master-merge-internal20240110
Master merge internal20240110
@@ -4,7 +4,7 @@ BASE_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu
BASE_GPU_CUDA113_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel
BASE_GPU_CUDA117_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.7.1-cudnn8-devel
BASE_GPU_CUDA118_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.8.0-cudnn8-devel
-BASE_GPU_CUDA121_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:22.04-cuda11.8.0-cudnn8-devel
+BASE_GPU_CUDA121_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:22.04-cuda12.1.0-cudnn8-devel
BASE_GPU_CUDA122_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:22.04-cuda11.2.2-cudnn8-devel
MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope
python_version=3.7.13

@@ -160,10 +160,11 @@ export TORCH_VERSION=$torch_version
export CUDATOOLKIT_VERSION=$cudatoolkit_version
export TENSORFLOW_VERSION=$tensorflow_version
echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\nis_ci:$is_ci_test\nis_dsw:$is_dsw\n"
echo -e "Base iamge: $BASE_IMAGE"
docker_file_content=`cat docker/Dockerfile.ubuntu`
if [ "$is_ci_test" != "True" ]; then
    echo "Building ModelScope lib, will install ModelScope lib to image"
-    docker_file_content="${docker_file_content} \nRUN export COMMIT_ID=$CIS_ENV_COMMIT_ID && pip install --no-cache-dir -U adaseq pai-easycv ms_swift funasr 'transformers<4.35.0'"
+    docker_file_content="${docker_file_content} \nRUN export COMMIT_ID=$CIS_ENV_COMMIT_ID && pip install --no-cache-dir -U adaseq pai-easycv ms_swift funasr 'transformers==4.36.2'"
    docker_file_content="${docker_file_content} \nRUN pip uninstall modelscope -y && export COMMIT_ID=$CIS_ENV_COMMIT_ID && cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b $CIS_ENV_BRANCH --single-branch $REPO_URL && cd MaaS-lib && pip install . && cd / && rm -fr /tmp/MaaS-lib"
    MMCV_WITH_OPS=1 MAX_JOBS=32 pip install --no-cache-dir 'mmcv-full<=1.7.0' && pip cache purge; \
fi

@@ -174,8 +175,15 @@ else
    echo "Building dsw image will need set ModelScope lib cache location."
    docker_file_content="${docker_file_content} \nENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope"
    # pre compile extension
-    docker_file_content="${docker_file_content} \nRUN export TORCH_CUDA_ARCH_LIST='6.0;6.1;7.0;7.5;8.0;8.9;9.0;8.6+PTX' && python -c 'from modelscope.utils.pre_compile import pre_compile_all;pre_compile_all()'"
+    docker_file_content="${docker_file_content} \nRUN pip uninstall -y tb-nightly && pip install --no-cache-dir -U tensorboard && TORCH_CUDA_ARCH_LIST='6.0 6.1 7.0 7.5 8.0 8.9 9.0 8.6+PTX' python -c 'from modelscope.utils.pre_compile import pre_compile_all;pre_compile_all()'"
fi
# install here for easycv extension conflict.
docker_file_content="${docker_file_content} \nRUN if [ \"$USE_GPU\" = \"True\" ] ; then \
    bash /tmp/install_tiny_cuda_nn.sh; \
else \
    echo 'cpu unsupport tiny_cuda_nn'; \
fi"

if [ "$is_ci_test" == "True" ]; then
    echo "Building CI image, uninstall modelscope"
    docker_file_content="${docker_file_content} \nRUN pip uninstall modelscope -y"

@@ -189,7 +197,7 @@ printf "$docker_file_content" > Dockerfile
while true
do
-  DOCKER_BUILDKIT=0 docker build -t $IMAGE_TO_BUILD \
+  docker build --progress=plain -t $IMAGE_TO_BUILD \
      --build-arg USE_GPU \
      --build-arg BASE_IMAGE \
      --build-arg PYTHON_VERSION \
@@ -1,10 +1,11 @@
ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-base
FROM $BASE_IMAGE
RUN apt-get update && \
-    apt-get install -y libsox-dev unzip zip iputils-ping telnet && \
+    apt-get install -y libsox-dev unzip zip iputils-ping telnet sudo && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

ARG CUDA_VERSION=cu121
# install jupyter plugin
RUN mkdir -p /root/.local/share/jupyter/labextensions/ && \
    cp -r /tmp/resources/jupyter_plugins/* /root/.local/share/jupyter/labextensions/

@@ -35,9 +36,9 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
# torchmetrics==0.11.4 for ofa
RUN if [ "$USE_GPU" = "True" ] ; then \
    pip install --no-cache-dir torchsde jupyterlab torchmetrics==0.11.4 tiktoken transformers_stream_generator bitsandbytes basicsr optimum && \
-    pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ && \
-    pip install --no-cache-dir -U xformers --index-url https://download.pytorch.org/whl/cu118 && \
-    pip install --no-cache-dir flash_attn==2.3.3+torch2.1cu118 tinycudann==1.7+cu118 vllm==0.2.1+cu118torch2.1 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
+    pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/ && \
+    pip install --no-cache-dir -U xformers --index-url https://download.pytorch.org/whl/cu121 && \
+    pip install --no-cache-dir -U flash_attn vllm; \
else \
    echo 'cpu unsupport vllm auto-gptq'; \
fi

@@ -51,6 +52,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
    pip install --no-cache-dir -r /var/modelscope/science.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
    pip install --no-cache-dir -r /var/modelscope/tests.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
    pip install --no-cache-dir -r /var/modelscope/svr.txt && \
    pip cache purge

COPY examples /modelscope/examples

@@ -117,7 +117,7 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
fi

RUN if [ "$USE_GPU" = "True" ] ; then \
-    pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/packages/mmcv_full-1.7.0-cp310-cp310-linux_x86_64.whl; \
+    pip install --no-cache-dir mmcv-full==1.7.0+torch2.1.1cu121 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
else \
    pip install --no-cache-dir mmcv_full==1.7.0+torch2.1cpu -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
fi
@@ -209,6 +209,8 @@ class Models(object):
    cluster_backend = 'cluster-backend'
    rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv'
    generic_lm = 'generic-lm'
+    audio_quantization = 'audio-quantization'
+    laura_codec = 'laura-codec'
    funasr = 'funasr'

    # multi-modal models

@@ -550,6 +552,9 @@ class Pipelines(object):
    segmentation_clustering = 'segmentation-clustering'
    lm_inference = 'language-score-prediction'
    speech_timestamp_inference = 'speech-timestamp-inference'
+    audio_quantization = 'audio-quantization'
+    audio_quantization_inference = 'audio-quantization-inference'
+    laura_codec_tts_inference = 'laura-codec-tts-inference'

    # multi-modal tasks
    image_captioning = 'image-captioning'
modelscope/models/audio/quantization/__init__.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .generic_audio_quantization import GenericAudioQuantization

else:
    _import_structure = {
        'generic_audio_quantization': ['GenericAudioQuantization'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
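The __init__.py above wires GenericAudioQuantization into ModelScope's lazy-import machinery: LazyImportModule replaces this package in sys.modules so that the submodule (and its funcodec dependency) is only imported when the name is first accessed. A minimal sketch of the intended effect, assuming modelscope is installed with its audio requirements; the printed module path is illustrative rather than verified against this commit:

import modelscope.models.audio.quantization as quantization

# No heavy imports have happened yet; _import_structure only records
# where GenericAudioQuantization lives.
cls = quantization.GenericAudioQuantization  # first access triggers the real import
print(cls.__module__)
# expected: modelscope.models.audio.quantization.generic_audio_quantization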
@@ -0,0 +1,45 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Frameworks, Tasks

__all__ = ['GenericAudioQuantization']


@MODELS.register_module(
    Tasks.audio_quantization, module_name=Models.audio_quantization)
class GenericAudioQuantization(Model):

    def __init__(self, model_dir: str, model_name: str,
                 model_config: Dict[str, Any], *args, **kwargs):
        """initialize the info of model.

        Args:
            model_dir (str): the model path.
            model_name (str): the itn model name from configuration.json
            model_config (Dict[str, Any]): the detail config about model from configuration.json
        """
        super().__init__(model_dir, model_name, model_config, *args, **kwargs)
        self.model_cfg = {
            # the recognition model dir path
            'model_workspace': model_dir,
            # the itn model name
            'model_name': model_name,
            # the am model file path
            'model_path': os.path.join(model_dir, model_name),
            # the recognition model config dict
            'model_config': model_config
        }

    def forward(self) -> Dict[str, Any]:
        """
        just return the model config

        """

        return self.model_cfg
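GenericAudioQuantization is a thin container: it records where the checkpoint and its configuration.json-derived settings live, and forward() simply returns that dictionary (the pipeline added further below calls self.model.forward() in its constructor to obtain it). A hedged sketch of how the class is typically reached; the model id is taken from the pipeline docstring in this commit, and from_pretrained is assumed to dispatch through the MODELS registry populated by the decorator above:

from modelscope.models import Model

# Loads the snapshot from the ModelScope hub if it is not cached locally.
model = Model.from_pretrained(
    'damo/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch')
cfg = model.forward()           # the dict assembled in __init__ above
print(cfg['model_workspace'])   # local snapshot directory
print(cfg['model_name'])        # checkpoint file name from configuration.json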
@@ -5,9 +5,13 @@ from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .sambert_hifi import SambertHifigan
+    from .laura_codec import LauraCodecGenModel
else:
-    _import_structure = {'sambert_hifi': ['SambertHifigan']}
+    _import_structure = {
+        'sambert_hifi': ['SambertHifigan'],
+        'laura_codec': ['LauraCodecGenModel'],
+    }
    import sys
    sys.modules[__name__] = LazyImportModule(
        __name__,
modelscope/models/audio/tts/laura_codec.py (new file, 44 lines)
@@ -0,0 +1,44 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Frameworks, Tasks

__all__ = ['LauraCodecGenModel']


@MODELS.register_module(Tasks.text_to_speech, module_name=Models.laura_codec)
class LauraCodecGenModel(Model):

    def __init__(self, model_dir: str, model_name: str,
                 model_config: Dict[str, Any], *args, **kwargs):
        """initialize the info of model.

        Args:
            model_dir (str): the model path.
            model_name (str): the itn model name from configuration.json
            model_config (Dict[str, Any]): the detail config about model from configuration.json
        """
        super().__init__(model_dir, model_name, model_config, *args, **kwargs)
        self.model_cfg = {
            # the recognition model dir path
            'model_workspace': model_dir,
            # the itn model name
            'model_name': model_name,
            # the am model file path
            'model_path': os.path.join(model_dir, model_name),
            # the recognition model config dict
            'model_config': model_config
        }

    def forward(self) -> Dict[str, Any]:
        """
        just return the model config

        """

        return self.model_cfg
@@ -19,7 +19,7 @@ class DepthAttention(nn.Module):
                 output_bias=True):
        super().__init__()
        inner_dim = dim_head * heads
-        context_dim = default(context_dim, query_dim)
+        context_dim = attention.default(context_dim, query_dim)

        self.scale = dim_head**-0.5
        self.heads = heads

@@ -91,9 +91,10 @@ class DepthTransformer(nn.Module):
            nn.Conv2d(inner_dim, inner_dim, 3, 1, 1, bias=False),
            nn.GroupNorm(8, inner_dim),
            nn.ReLU(True),
-            zero_module(nn.Conv2d(inner_dim, dim, 3, 1, 1, bias=False)),
+            attention.zero_module(
+                nn.Conv2d(inner_dim, dim, 3, 1, 1, bias=False)),
        )
-        self.checkpoint = checkpoint
+        self.checkpoint = attention.checkpoint

    def forward(self, x, context=None):
        return checkpoint(self._forward, (x, context), self.parameters(),
modelscope/pipelines/audio/audio_quantization_pipeline.py (new file, 229 lines)
@@ -0,0 +1,229 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
from typing import Any, Dict, List, Sequence, Tuple, Union

import numpy as np
import yaml

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_scp_from_url,
                                                update_local_model)
from modelscope.utils.constant import Frameworks, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['AudioQuantizationPipeline']


@PIPELINES.register_module(
    Tasks.audio_quantization,
    module_name=Pipelines.audio_quantization_inference)
class AudioQuantizationPipeline(Pipeline):
    """Audio Quantization Inference Pipeline
    use `model` to create a audio quantization pipeline.

    Args:
        model (AudioQuantizationPipeline): A model instance, or a model local dir, or a model id in the model hub.
        kwargs (dict, `optional`):
            Extra kwargs passed into the preprocessor's constructor.
    Examples:
        >>> from modelscope.pipelines import pipeline
        >>> from modelscope.utils.constant import Tasks
        >>> pipeline_aq = pipeline(
        >>>     task=Tasks.audio_quantization,
        >>>     model='damo/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch'
        >>> )
        >>> audio_in='example.wav'
        >>> print(pipeline_aq(audio_in))

    """

    def __init__(self,
                 model: Union[Model, str] = None,
                 ngpu: int = 1,
                 **kwargs):
        """use `model` to create an asr pipeline for prediction
        """
        super().__init__(model=model, **kwargs)
        self.model_cfg = self.model.forward()
        self.cmd = self.get_cmd(kwargs, model)

        from funcodec.bin import codec_inference
        self.funasr_infer_modelscope = codec_inference.inference_modelscope(
            mode=self.cmd['mode'],
            output_dir=self.cmd['output_dir'],
            batch_size=self.cmd['batch_size'],
            dtype=self.cmd['dtype'],
            ngpu=ngpu,
            seed=self.cmd['seed'],
            num_workers=self.cmd['num_workers'],
            log_level=self.cmd['log_level'],
            key_file=self.cmd['key_file'],
            config_file=self.cmd['config_file'],
            model_file=self.cmd['model_file'],
            model_tag=self.cmd['model_tag'],
            allow_variable_data_keys=self.cmd['allow_variable_data_keys'],
            streaming=self.cmd['streaming'],
            sampling_rate=self.cmd['sampling_rate'],
            bit_width=self.cmd['bit_width'],
            use_scale=self.cmd['use_scale'],
            param_dict=self.cmd['param_dict'],
            **kwargs,
        )

    def __call__(self,
                 audio_in: Union[tuple, str, Any] = None,
                 output_dir: str = None,
                 param_dict: dict = None) -> Dict[str, Any]:
        if len(audio_in) == 0:
            raise ValueError('The input should not be null.')
        else:
            self.audio_in = audio_in
        if output_dir is not None:
            self.cmd['output_dir'] = output_dir
        self.cmd['param_dict'] = param_dict

        output = self.forward(self.audio_in)
        result = self.postprocess(output)
        return result

    def postprocess(self, inputs: list) -> Dict[str, Any]:
        """Postprocessing
        """
        rst = {}
        for i in range(len(inputs)):
            if len(inputs) == 1 and i == 0:
                recon_wav = inputs[0]['value']
                output_wav = recon_wav.cpu().numpy()[0]
                output_wav = (output_wav * (2**15)).astype(np.int16)
                rst[OutputKeys.OUTPUT_WAV] = output_wav
            else:
                # for multiple inputs
                rst[inputs[i]['key']] = inputs[i]['value']
        return rst

    def get_cmd(self, extra_args, model_path) -> Dict[str, Any]:
        # generate asr inference command
        mode = self.model_cfg['model_config']['mode']
        _model_path = os.path.join(
            self.model_cfg['model_workspace'],
            self.model_cfg['model_config']['model_file'])
        _model_config = os.path.join(
            self.model_cfg['model_workspace'],
            self.model_cfg['model_config']['config_file'])
        update_local_model(self.model_cfg['model_config'], model_path,
                           extra_args)
        cmd = {
            'mode': mode,
            'output_dir': None,
            'batch_size': 1,
            'dtype': 'float32',
            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
            'seed': 0,
            'num_workers': 0,
            'log_level': 'ERROR',
            'key_file': None,
            'model_file': _model_path,
            'config_file': _model_config,
            'model_tag': None,
            'allow_variable_data_keys': True,
            'streaming': False,
            'sampling_rate': 16000,
            'bit_width': 8000,
            'use_scale': True,
            'param_dict': None,
        }
        user_args_dict = [
            'output_dir',
            'batch_size',
            'ngpu',
            'log_level',
            'allow_variable_data_keys',
            'streaming',
            'num_workers',
            'sampling_rate',
            'bit_width',
            'use_scale',
            'param_dict',
        ]

        # re-write the config with configure.json
        for user_args in user_args_dict:
            if (user_args in self.model_cfg['model_config']
                    and self.model_cfg['model_config'][user_args] is not None):
                if isinstance(cmd[user_args], dict) and isinstance(
                        self.model_cfg['model_config'][user_args], dict):
                    cmd[user_args].update(
                        self.model_cfg['model_config'][user_args])
                else:
                    cmd[user_args] = self.model_cfg['model_config'][user_args]

        # rewrite the config with user args
        for user_args in user_args_dict:
            if user_args in extra_args:
                if extra_args.get(user_args) is not None:
                    if isinstance(cmd[user_args], dict) and isinstance(
                            extra_args[user_args], dict):
                        cmd[user_args].update(extra_args[user_args])
                    else:
                        cmd[user_args] = extra_args[user_args]
                del extra_args[user_args]

        return cmd

    def forward(self, audio_in: Union[tuple, str, Any] = None) -> list:
        """Decoding
        """
        # log file_path/url or tuple (str, str)
        if isinstance(audio_in, str):
            logger.info(f'Audio Quantization Processing: {audio_in} ...')
        else:
            logger.info(
                f'Audio Quantization Processing: {str(audio_in)[:100]} ...')

        data_cmd, raw_inputs = None, None
        if isinstance(audio_in, str):
            # for scp inputs
            if len(audio_in.split(',')) == 3:
                data_cmd = [tuple(audio_in.split(','))]
            # for single-file inputs
            else:
                audio_scp, _ = generate_scp_from_url(audio_in)
                raw_inputs = audio_scp
        # for raw bytes
        elif isinstance(audio_in, bytes):
            data_cmd = (audio_in, 'speech', 'bytes')
        # for ndarray and tensor inputs
        else:
            import torch
            import numpy as np
            if isinstance(audio_in, torch.Tensor):
                raw_inputs = audio_in
            elif isinstance(audio_in, np.ndarray):
                raw_inputs = audio_in
            else:
                raise TypeError('Unsupported data type.')

        self.cmd['name_and_type'] = data_cmd
        self.cmd['raw_inputs'] = raw_inputs
        result = self.run_inference(self.cmd)

        return result

    def run_inference(self, cmd):
        if self.framework == Frameworks.torch:
            sv_result = self.funasr_infer_modelscope(
                data_path_and_name_and_type=cmd['name_and_type'],
                raw_inputs=cmd['raw_inputs'],
                output_dir_v2=cmd['output_dir'],
                param_dict=cmd['param_dict'])
        else:
            raise ValueError('model type is mismatching')

        return sv_result
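The forward() branches above accept several input forms: a plain path or URL (converted by generate_scp_from_url), a comma-separated 'key,path,type' triple, raw bytes, a NumPy array, or a torch tensor. A short usage sketch reusing pipeline_aq from the docstring example; the file name and the 'sound' type label are illustrative assumptions, not values shipped with this commit:

from modelscope.outputs import OutputKeys

# Single file or URL: converted to an scp entry internally.
result = pipeline_aq('example.wav')   # hypothetical local file
wav = result[OutputKeys.OUTPUT_WAV]   # int16 samples, scaled by 2**15 in postprocess()

# Comma-separated triple: forwarded to funcodec as (key, path, type);
# the exact type label expected by funcodec is an assumption here.
result = pipeline_aq('utt1,example.wav,sound')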
modelscope/pipelines/audio/codec_based_synthesis_pipeline.py (new file, 276 lines)
@@ -0,0 +1,276 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict, Optional, Union

import json
import numpy as np

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_scp_from_url,
                                                update_local_model)
from modelscope.utils.constant import Frameworks, ModelFile, Tasks
from modelscope.utils.hub import snapshot_download
from modelscope.utils.logger import get_logger

__all__ = ['LauraCodecTTSPipeline']

logger = get_logger()


@PIPELINES.register_module(
    Tasks.text_to_speech, module_name=Pipelines.laura_codec_tts_inference)
class LauraCodecTTSPipeline(Pipeline):
    """Laura-style Codec-based TTS Inference Pipeline
    use `model` to create a TTS pipeline.

    Args:
        model (LauraCodecTTSPipeline): A model instance, or a model local dir, or a model id in the model hub.
        kwargs (dict, `optional`):
            Extra kwargs passed into the preprocessor's constructor.
    Examples:
        >>> from modelscope.pipelines import pipeline
        >>> from modelscope.utils.constant import Tasks
        >>> my_pipeline = pipeline(
        >>>     task=Tasks.text_to_speech,
        >>>     model='damo/speech_synthesizer-laura-en-libritts-16k-codec_nq2-pytorch'
        >>> )
        >>> text='nothing was to be done but to put about, and return in disappointment towards the north.'
        >>> prompt_text='one of these is context'
        >>> prompt_speech='example/prompt.wav'
        >>> print(my_pipeline(text))

    """

    def __init__(self,
                 model: Union[Model, str] = None,
                 codec_model: Optional[Union[Model, str]] = None,
                 codec_model_revision: Optional[str] = None,
                 ngpu: int = 1,
                 **kwargs):
        """use `model` to create an asr pipeline for prediction
        """
        super().__init__(model=model, **kwargs)
        self.model_cfg = self.model.forward()
        self.codec_model = codec_model
        self.codec_model_revision = codec_model_revision
        self.cmd = self.get_cmd(kwargs, model)

        from funcodec.bin import text2audio_inference
        self.funasr_infer_modelscope = text2audio_inference.inference_func(
            mode=self.cmd['mode'],
            output_dir=self.cmd['output_dir'],
            batch_size=self.cmd['batch_size'],
            dtype=self.cmd['dtype'],
            ngpu=ngpu,
            seed=self.cmd['seed'],
            num_workers=self.cmd['num_workers'],
            log_level=self.cmd['log_level'],
            key_file=self.cmd['key_file'],
            config_file=self.cmd['config_file'],
            model_file=self.cmd['model_file'],
            model_tag=self.cmd['model_tag'],
            allow_variable_data_keys=self.cmd['allow_variable_data_keys'],
            streaming=self.cmd['streaming'],
            text_emb_model=self.cmd['text_emb_model'],
            beam_size=self.cmd['beam_size'],
            sampling=self.cmd['sampling'],
            continual=self.cmd['continual'],
            tokenize_to_phone=self.cmd['tokenize_to_phone'],
            exclude_prompt=self.cmd['exclude_prompt'],
            codec_config_file=self.cmd['codec_config_file'],
            codec_model_file=self.cmd['codec_model_file'],
            param_dict=self.cmd['param_dict'])

    def __call__(self,
                 text: Union[tuple, str, Any] = None,
                 prompt_text: Union[tuple, str, Any] = None,
                 prompt_audio: Union[tuple, str, Any] = None,
                 output_dir: str = None,
                 param_dict: dict = None) -> Dict[str, Any]:
        if len(text) == 0:
            raise ValueError('The input should not be null.')
        if output_dir is not None:
            self.cmd['output_dir'] = output_dir
        self.cmd['param_dict'] = param_dict

        output = self.forward(text, prompt_text, prompt_audio)
        result = self.postprocess(output)
        return result

    def postprocess(self, inputs: list) -> Dict[str, Any]:
        """Postprocessing
        """
        rst = {}
        for i in range(len(inputs)):
            if len(inputs) == 1 and i == 0:
                recon_wav = inputs[0]['value']['gen']
                rst[OutputKeys.OUTPUT_WAV] = recon_wav.cpu().numpy()[0]
            else:
                # for multiple inputs
                rst[inputs[i]['key']] = inputs[i]['value']['gen']
        return rst

    def load_codec_model(self, cmd):
        if self.codec_model is not None and self.codec_model != '':
            if os.path.exists(self.codec_model):
                codec_model = self.codec_model
            else:
                codec_model = snapshot_download(
                    self.codec_model, revision=self.codec_model_revision)
            logger.info('loading codec model from {0} ...'.format(codec_model))
            config_path = os.path.join(codec_model, ModelFile.CONFIGURATION)
            model_cfg = json.loads(open(config_path).read())
            model_dir = os.path.dirname(config_path)
            cmd['codec_model_file'] = os.path.join(
                model_dir, model_cfg['model']['model_config']['model_file'])
            cmd['codec_config_file'] = os.path.join(
                model_dir, model_cfg['model']['model_config']['config_file'])

    def get_cmd(self, extra_args, model_path) -> Dict[str, Any]:
        # generate asr inference command
        mode = self.model_cfg['model_config']['mode']
        _model_path = os.path.join(
            self.model_cfg['model_workspace'],
            self.model_cfg['model_config']['model_file'])
        _model_config = os.path.join(
            self.model_cfg['model_workspace'],
            self.model_cfg['model_config']['config_file'])
        update_local_model(self.model_cfg['model_config'], model_path,
                           extra_args)

        cmd = {
            'mode': mode,
            'output_dir': None,
            'batch_size': 1,
            'dtype': 'float32',
            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
            'seed': 0,
            'num_workers': 0,
            'log_level': 'ERROR',
            'key_file': None,
            'model_file': _model_path,
            'config_file': _model_config,
            'model_tag': None,
            'allow_variable_data_keys': True,
            'streaming': False,
            'beam_size': 1,
            'sampling': 25,
            'text_emb_model': None,
            'continual': True,
            'tokenize_to_phone': True,
            'exclude_prompt': True,
            'codec_model_file': None,
            'codec_config_file': None,
            'param_dict': None,
        }
        user_args_dict = [
            'output_dir',
            'batch_size',
            'ngpu',
            'log_level',
            'allow_variable_data_keys',
            'streaming',
            'num_workers',
            'sampling_rate',
            'bit_width',
            'use_scale',
            'param_dict',
        ]

        model_config = self.model_cfg['model_config']
        if model_config.__contains__(
                'codec_model') and self.codec_model is None:
            self.codec_model = model_config['codec_model']
        if model_config.__contains__(
                'codec_model_revision') and self.codec_model_revision is None:
            self.codec_model_revision = model_config['codec_model_revision']
        self.load_codec_model(cmd)

        # re-write the config with configure.json
        for user_args in user_args_dict:
            if (user_args in self.model_cfg['model_config']
                    and self.model_cfg['model_config'][user_args] is not None):
                if isinstance(cmd[user_args], dict) and isinstance(
                        self.model_cfg['model_config'][user_args], dict):
                    cmd[user_args].update(
                        self.model_cfg['model_config'][user_args])
                else:
                    cmd[user_args] = self.model_cfg['model_config'][user_args]

        # rewrite the config with user args
        for user_args in user_args_dict:
            if user_args in extra_args:
                if extra_args.get(user_args) is not None:
                    if isinstance(cmd[user_args], dict) and isinstance(
                            extra_args[user_args], dict):
                        cmd[user_args].update(extra_args[user_args])
                    else:
                        cmd[user_args] = extra_args[user_args]
                del extra_args[user_args]

        return cmd

    def forward(self,
                text: Union[tuple, str, Any] = None,
                prompt_text: Union[tuple, str, Any] = None,
                prompt_audio: Union[tuple, str, Any] = None,
                **forward_params) -> list:
        """Decoding
        """
        if isinstance(text, str):
            logger.info(f'Generate speech for: {text} ...')

        data_cmd, raw_inputs = None, None
        # process text input
        # for scp inputs
        if len(text.split(',')) == 3:
            data_cmd = [tuple(text.split(','))]
        # for single-file inputs
        else:
            raw_inputs = [text]

        if prompt_text is not None and prompt_audio is not None:
            if len(prompt_text.split(',')) == 3:
                data_cmd.append(tuple(prompt_text.split(',')))
            else:
                raw_inputs.append(prompt_text)

            if isinstance(prompt_audio, str):
                if len(prompt_audio.split(',')) == 3:
                    data_cmd.append(tuple(prompt_audio.split(',')))
                else:
                    audio_path, _ = generate_scp_from_url(prompt_audio)
                    raw_inputs.append(audio_path)
            # for ndarray and tensor inputs
            else:
                import torch
                if isinstance(prompt_audio, torch.Tensor):
                    raw_inputs.append(prompt_audio.numpy())
                elif isinstance(prompt_audio, np.ndarray):
                    raw_inputs.append(prompt_audio)
                else:
                    raise TypeError(
                        f'Unsupported prompt audio type {type(prompt_audio)}.')

        self.cmd['name_and_type'] = data_cmd
        self.cmd['raw_inputs'] = raw_inputs
        result = self.run_inference(self.cmd)

        return result

    def run_inference(self, cmd):
        if self.framework == Frameworks.torch:
            sv_result = self.funasr_infer_modelscope(
                data_path_and_name_and_type=cmd['name_and_type'],
                raw_inputs=cmd['raw_inputs'],
                output_dir_v2=cmd['output_dir'],
                param_dict=cmd['param_dict'])
        else:
            raise ValueError('model type is mismatching')

        return sv_result
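The docstring example above only passes text, but __call__ also takes prompt_text and prompt_audio, which forward() folds into the data handed to funcodec's text2audio inference for prompted (zero-shot) synthesis. A hedged sketch reusing the names defined in the docstring; the prompt file path is a placeholder and the 16 kHz output rate is inferred from the model id rather than stated in this commit:

from modelscope.outputs import OutputKeys

result = my_pipeline(
    text,
    prompt_text=prompt_text,
    prompt_audio=prompt_speech)       # path, URL, ndarray or torch.Tensor
wav = result[OutputKeys.OUTPUT_WAV]   # generated waveform, assumed 16 kHz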
@@ -1,16 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

-from typing import Any, Dict, List
+from typing import Any, Dict

import numpy as np

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.audio.tts import SambertHifigan
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, InputModel, Pipeline
from modelscope.pipelines.builder import PIPELINES
-from modelscope.utils.constant import Fields, Tasks
+from modelscope.utils.constant import Tasks

__all__ = ['TextToSpeechSambertHifiganPipeline']
@@ -246,6 +246,7 @@ class AudioTasks(object):
    speaker_verification = 'speaker-verification'
    speech_language_recognition = 'speech-language-recognition'
    speaker_diarization = 'speaker-diarization'
+    audio_quantization = 'audio-quantization'
    voice_activity_detection = 'voice-activity-detection'
    language_score_prediction = 'language-score-prediction'
    speech_timestamp = 'speech-timestamp'
@@ -137,6 +137,27 @@
            }
        }
    },
+    "audio-quantization": {
+        "input": {
+            "type": "object",
+            "properties": {
+                "wav": {
+                    "type": "string",
+                    "description": "Base64 encoded audio file or url string.."
+                }
+            }
+        },
+        "parameters": {},
+        "output": {
+            "type": "object",
+            "properties": {
+                "output_wav": {
+                    "type": "string",
+                    "description": "The base64 encoded WAV."
+                }
+            }
+        }
+    },
    "bad-image-detecting": {
        "input": {
            "type": "object",
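The schema entry added above describes the service I/O for the new audio-quantization task. Purely for orientation, a payload shaped to match it might look like the following Python dicts; everything beyond the "wav" and "output_wav" fields is an assumption, not a documented service contract:

request_payload = {
    'input': {
        # Base64-encoded audio or a URL string, per the schema above.
        'wav': 'https://example.com/sample.wav'
    },
    'parameters': {}
}

expected_response = {
    # Base64-encoded WAV, per the schema above.
    'output_wav': '<base64 string>'
}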
@@ -20,7 +20,6 @@ def pre_compile_all():
    if torch.cuda.is_available():  # extension require cuda.
        # pre compile pai-easycv
        from easycv.thirdparty.deformable_attention.functions import ms_deform_attn_func
        pre_compile_megatron_util()
    # extension for all platform.
    pre_compile_megatron_util()
@@ -2,3 +2,4 @@
-r audio/audio_kws.txt
-r audio/audio_signal.txt
-r audio/audio_tts.txt
+-r audio/audio_codec.txt
requirements/audio/audio_codec.txt (new file, 1 line)
@@ -0,0 +1 @@
funcodec>=0.2.0
@@ -1,7 +1,7 @@
accelerate
cloudpickle
decord>=0.6.0
-diffusers>=0.19.0
+diffusers>=0.25.0
fairseq
ftfy>=6.0.3
librosa==0.10.1