update funasr1.0 (#715)

* funasr1.0  modelscope

* fix lint issue

---------

Co-authored-by: mulin.lyh <mulin.lyh@taobao.com>
zhifu gao
2024-01-12 12:02:01 +08:00
committed by GitHub
parent 383a4dc44f
commit 49c04ea47e
19 changed files with 158 additions and 2288 deletions

View File

@@ -209,6 +209,7 @@ class Models(object):
cluster_backend = 'cluster-backend'
rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv'
generic_lm = 'generic-lm'
+funasr = 'funasr'
# multi-modal models
ofa = 'ofa'
@@ -533,11 +534,8 @@ class Pipelines(object):
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
speech_separation = 'speech-separation'
kws_kwsbp = 'kws-kwsbp'
-asr_inference = 'asr-inference'
asr_wenet_inference = 'asr-wenet-inference'
itn_inference = 'itn-inference'
-punc_inference = 'punc-inference'
-sv_inference = 'sv-inference'
speaker_diarization_inference = 'speaker-diarization-inference'
vad_inference = 'vad-inference'
funasr_speech_separation = 'funasr-speech-separation'
@@ -591,6 +589,9 @@ class Pipelines(object):
# science tasks
protein_structure = 'unifold-protein-structure'
+# funasr task
+funasr_pipeline = 'funasr-pipeline'
DEFAULT_MODEL_FOR_PIPELINE = {
# TaskName: (pipeline_module_name, model_repo)
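For illustration only (not part of the diff): a default entry for the new funasr pipeline would follow the same (pipeline_module_name, model_repo) pattern noted in the comment above. The model id below is the one used in the examples elsewhere in this change; treating it as the default here is an assumption.

from modelscope.metainfo import Pipelines
from modelscope.utils.constant import Tasks

# Hypothetical sketch of a DEFAULT_MODEL_FOR_PIPELINE entry for the unified funasr pipeline.
DEFAULT_MODEL_FOR_PIPELINE = {
    Tasks.auto_speech_recognition:
    (Pipelines.funasr_pipeline,
     'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'),
}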

View File

@@ -1,51 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Frameworks, Tasks
__all__ = ['GenericAutomaticSpeechRecognition']
@MODELS.register_module(
Tasks.auto_speech_recognition, module_name=Models.generic_asr)
@MODELS.register_module(
Tasks.voice_activity_detection, module_name=Models.generic_asr)
@MODELS.register_module(
Tasks.speech_separation, module_name=Models.generic_asr)
@MODELS.register_module(
Tasks.language_score_prediction, module_name=Models.generic_asr)
@MODELS.register_module(Tasks.speech_timestamp, module_name=Models.generic_asr)
class GenericAutomaticSpeechRecognition(Model):
def __init__(self, model_dir: str, am_model_name: str,
model_config: Dict[str, Any], *args, **kwargs):
"""initialize the info of model.
Args:
model_dir (str): the model path.
am_model_name (str): the am model name from configuration.json
model_config (Dict[str, Any]): the detail config about model from configuration.json
"""
super().__init__(model_dir, am_model_name, model_config, *args,
**kwargs)
self.model_cfg = {
# the recognition model dir path
'model_workspace': model_dir,
# the am model name
'am_model': am_model_name,
# the am model file path
'am_model_path': os.path.join(model_dir, am_model_name),
# the recognition model config dict
'model_config': model_config
}
def forward(self) -> Dict[str, Any]:
"""preload model and return the info of the model
"""
return self.model_cfg

View File

@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
import json
from funasr import AutoModel
from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Frameworks, Tasks
__all__ = ['GenericFunASR']
@MODELS.register_module(
Tasks.auto_speech_recognition, module_name=Models.funasr)
@MODELS.register_module(
Tasks.voice_activity_detection, module_name=Models.funasr)
@MODELS.register_module(
Tasks.language_score_prediction, module_name=Models.funasr)
@MODELS.register_module(Tasks.punctuation, module_name=Models.funasr)
@MODELS.register_module(Tasks.speaker_diarization, module_name=Models.funasr)
@MODELS.register_module(Tasks.speaker_verification, module_name=Models.funasr)
@MODELS.register_module(Tasks.speech_separation, module_name=Models.funasr)
@MODELS.register_module(Tasks.speech_timestamp, module_name=Models.funasr)
@MODELS.register_module(Tasks.emotion_recognition, module_name=Models.funasr)
class GenericFunASR(Model):
def __init__(self, model_dir, *args, **kwargs):
"""initialize the info of model.
Args:
model_dir (str): the model path.
am_model_name (str): the am model name from configuration.json
model_config (Dict[str, Any]): the detail config about model from configuration.json
"""
super().__init__(model_dir, *args, **kwargs)
model_cfg = json.loads(
open(os.path.join(model_dir, 'configuration.json')).read())
if 'vad_model' not in kwargs and 'vad_model' in model_cfg:
kwargs['vad_model'] = model_cfg['vad_model']
kwargs['vad_model_revision'] = model_cfg.get(
'vad_model_revision', None)
if 'punc_model' not in kwargs and 'punc_model' in model_cfg:
kwargs['punc_model'] = model_cfg['punc_model']
kwargs['punc_model_revision'] = model_cfg.get(
'punc_model_revision', None)
if 'spk_model' not in kwargs and 'spk_model' in model_cfg:
kwargs['spk_model'] = model_cfg['spk_model']
kwargs['spk_model_revision'] = model_cfg.get(
'spk_model_revision', None)
self.model = AutoModel(model=model_dir, **kwargs)
def forward(self, *args, **kwargs):
"""preload model and return the info of the model
"""
output = self.model(*args, **kwargs)
return output
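As a rough sketch of what the wrapper above does (the local model dir and wav name are placeholders, not part of this change): companion vad/punc/spk models declared in the model's configuration.json are forwarded to funasr.AutoModel, and inference goes through AutoModel's generate interface.

import json
import os

from funasr import AutoModel

model_dir = '/path/to/local/funasr_model'  # placeholder: a downloaded FunASR 1.0 model dir
with open(os.path.join(model_dir, 'configuration.json')) as f:
    cfg = json.load(f)

kwargs = {}
# Mirror GenericFunASR.__init__: pick up companion models declared in configuration.json.
for name in ('vad_model', 'punc_model', 'spk_model'):
    if name in cfg:
        kwargs[name] = cfg[name]
        kwargs[name + '_revision'] = cfg.get(name + '_revision', None)

model = AutoModel(model=model_dir, **kwargs)
# generate() is FunASR 1.0's inference entry point; the wav path is a placeholder.
print(model.generate(input='asr_example_zh.wav'))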

View File

@@ -1,43 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Frameworks, Tasks
@MODELS.register_module(Tasks.punctuation, module_name=Models.generic_punc)
class PunctuationProcessing(Model):
def __init__(self, model_dir: str, punc_model_name: str,
punc_model_config: Dict[str, Any], *args, **kwargs):
"""initialize the info of model.
Args:
model_dir (str): the model path.
punc_model_name (str): the punc model name from configuration.json
punc_model_config (Dict[str, Any]): the detail config about model from configuration.json
"""
super().__init__(model_dir, punc_model_name, punc_model_config, *args,
**kwargs)
self.model_cfg = {
# the recognition model dir path
'model_workspace': model_dir,
# the punc model name
'punc_model': punc_model_name,
# the punc model file path
'punc_model_path': os.path.join(model_dir, punc_model_name),
# the recognition model config dict
'model_config': punc_model_config
}
def forward(self) -> Dict[str, Any]:
"""
just return the model config
"""
return self.model_cfg

View File

@@ -1,45 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Frameworks, Tasks
@MODELS.register_module(
Tasks.speaker_verification, module_name=Models.generic_sv)
@MODELS.register_module(
Tasks.speaker_diarization, module_name=Models.generic_sv)
class SpeakerVerification(Model):
def __init__(self, model_dir: str, model_name: str,
model_config: Dict[str, Any], *args, **kwargs):
"""initialize the info of model.
Args:
model_dir (str): the model path.
model_name (str): the sv model name from configuration.json
model_config (Dict[str, Any]): the detail config about model from configuration.json
"""
super().__init__(model_dir, model_name, model_config, *args, **kwargs)
self.model_cfg = {
# the recognition model dir path
'model_workspace': model_dir,
# the sv model name
'model_name': model_name,
# the sv model file path
'model_path': os.path.join(model_dir, model_name),
# the recognition model config dict
'model_config': model_config
}
def forward(self) -> Dict[str, Any]:
"""
just return the model config
"""
return self.model_cfg

View File

@@ -1,591 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
import json
import yaml
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import WavToScp
from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav,
generate_scp_from_url,
load_bytes_from_url,
update_local_model)
from modelscope.utils.constant import Frameworks, ModelFile, Tasks
from modelscope.utils.hub import snapshot_download
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['AutomaticSpeechRecognitionPipeline']
@PIPELINES.register_module(
Tasks.auto_speech_recognition, module_name=Pipelines.asr_inference)
class AutomaticSpeechRecognitionPipeline(Pipeline):
"""ASR Inference Pipeline
Example:
>>> from modelscope.pipelines import pipeline
>>> from modelscope.utils.constant import Tasks
>>> inference_pipeline = pipeline(
>>> task=Tasks.auto_speech_recognition,
>>> model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
>>> rec_result = inference_pipeline(
>>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
>>> print(rec_result)
"""
def __init__(self,
model: Union[Model, str] = None,
preprocessor: WavToScp = None,
vad_model: Optional[Union[Model, str]] = None,
vad_model_revision: Optional[str] = None,
punc_model: Optional[Union[Model, str]] = None,
punc_model_revision: Optional[str] = None,
lm_model: Optional[Union[Model, str]] = None,
lm_model_revision: Optional[str] = None,
timestamp_model: Optional[Union[Model, str]] = None,
timestamp_model_revision: Optional[str] = None,
ngpu: int = 1,
**kwargs):
"""
Use `model` and `preprocessor` to create an asr pipeline for prediction
Args:
model ('Model' or 'str'):
The pipeline handles three types of model:
- A model instance
- A model local dir
- A model id in the model hub
preprocessor:
(list of) Preprocessor object
vad_model (Optional: 'Model' or 'str'):
voice activity detection model from model hub or local
example: 'damo/speech_fsmn_vad_zh-cn-16k-common-pytorch'
punc_model (Optional: 'Model' or 'str'):
punctuation model from model hub or local
example: 'damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
lm_model (Optional: 'Model' or 'str'):
language model from model hub or local
example: 'damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch'
timestamp_model (Optional: 'Model' or 'str'):
timestamp model from model hub or local
example: 'damo/speech_timestamp_predictor-v1-16k-offline'
output_dir('str'):
output dir path
batch_size('int'):
the batch size for inference
ngpu('int'):
the number of gpus, 0 indicates CPU mode
beam_size('int'):
beam size for decoding
ctc_weight('float'):
the CTC weight in joint decoding
lm_weight('float'):
lm weight
decoding_ind('int', defaults to 0):
decoding ind
decoding_mode('str', defaults to 'model1'):
decoding mode
vad_model_file('str'):
vad model file
vad_infer_config('str'):
VAD infer configuration
vad_cmvn_file('str'):
global CMVN file
punc_model_file('str'):
punc model file
punc_infer_config('str'):
punc infer config
param_dict('dict'):
extra kwargs
"""
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.vad_model = vad_model
self.vad_model_revision = vad_model_revision
self.punc_model = punc_model
self.punc_model_revision = punc_model_revision
self.lm_model = lm_model
self.lm_model_revision = lm_model_revision
self.timestamp_model = timestamp_model
self.timestamp_model_revision = timestamp_model_revision
self.model_cfg = self.model.forward()
self.cmd = self.get_cmd(kwargs, model)
from funasr.bin import asr_inference_launch
self.funasr_infer_modelscope = asr_inference_launch.inference_launch(
mode=self.cmd['mode'],
maxlenratio=self.cmd['maxlenratio'],
minlenratio=self.cmd['minlenratio'],
batch_size=self.cmd['batch_size'],
beam_size=self.cmd['beam_size'],
ngpu=ngpu,
ctc_weight=self.cmd['ctc_weight'],
lm_weight=self.cmd['lm_weight'],
penalty=self.cmd['penalty'],
log_level=self.cmd['log_level'],
asr_train_config=self.cmd['asr_train_config'],
asr_model_file=self.cmd['asr_model_file'],
cmvn_file=self.cmd['cmvn_file'],
lm_file=self.cmd['lm_file'],
token_type=self.cmd['token_type'],
key_file=self.cmd['key_file'],
lm_train_config=self.cmd['lm_train_config'],
bpemodel=self.cmd['bpemodel'],
allow_variable_data_keys=self.cmd['allow_variable_data_keys'],
output_dir=self.cmd['output_dir'],
dtype=self.cmd['dtype'],
seed=self.cmd['seed'],
ngram_weight=self.cmd['ngram_weight'],
nbest=self.cmd['nbest'],
num_workers=self.cmd['num_workers'],
vad_infer_config=self.cmd['vad_infer_config'],
vad_model_file=self.cmd['vad_model_file'],
vad_cmvn_file=self.cmd['vad_cmvn_file'],
punc_model_file=self.cmd['punc_model_file'],
punc_infer_config=self.cmd['punc_infer_config'],
timestamp_model_file=self.cmd['timestamp_model_file'],
timestamp_infer_config=self.cmd['timestamp_infer_config'],
timestamp_cmvn_file=self.cmd['timestamp_cmvn_file'],
outputs_dict=self.cmd['outputs_dict'],
param_dict=self.cmd['param_dict'],
token_num_relax=self.cmd['token_num_relax'],
decoding_ind=self.cmd['decoding_ind'],
decoding_mode=self.cmd['decoding_mode'],
fake_streaming=self.cmd['fake_streaming'],
model_lang=self.cmd['model_lang'],
**kwargs,
)
def __call__(self,
audio_in: Union[str, bytes],
audio_fs: int = None,
recog_type: str = None,
audio_format: str = None,
output_dir: str = None,
param_dict: dict = None,
**kwargs) -> Dict[str, Any]:
from funasr.utils import asr_utils
"""
Decoding the input audios
Args:
audio_in('str' or 'bytes'):
- A string containing a local path to a wav file
- A string containing a local path to a scp
- A string containing a wav url
- A bytes input
audio_fs('int'):
frequency of sample
recog_type('str'):
recog type
audio_format('str'):
audio format
output_dir('str'):
output dir
param_dict('dict'):
extra kwargs
Return:
A dictionary of results or a list of dictionaries of results.
The dictionary contains the following keys:
- **text** ('str') -- The ASR result.
"""
# code base
# code_base = self.cmd['code_base']
self.recog_type = recog_type
self.audio_format = audio_format
self.audio_fs = None
checking_audio_fs = None
self.raw_inputs = None
if output_dir is not None:
self.cmd['output_dir'] = output_dir
self.cmd['param_dict'] = param_dict
if isinstance(audio_in, str):
# for funasr code, generate wav.scp from url or local path
if audio_in.startswith('http') or os.path.isfile(audio_in):
self.audio_in, self.raw_inputs = generate_scp_from_url(
audio_in)
else:
raise FileNotFoundError(
f'file {audio_in} NOT FOUND, please CHECK!')
elif isinstance(audio_in, bytes):
self.audio_in = audio_in
self.raw_inputs = None
else:
import numpy
import torch
if isinstance(audio_in, torch.Tensor):
self.audio_in = None
self.raw_inputs = audio_in
elif isinstance(audio_in, numpy.ndarray):
self.audio_in = None
self.raw_inputs = audio_in
# set the sample_rate of audio_in if checking_audio_fs is valid
if checking_audio_fs is not None:
self.audio_fs = checking_audio_fs
if recog_type is None or audio_format is None:
self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking(
audio_in=self.audio_in,
recog_type=recog_type,
audio_format=audio_format)
if hasattr(asr_utils,
'sample_rate_checking') and self.audio_in is not None:
checking_audio_fs = asr_utils.sample_rate_checking(
self.audio_in, self.audio_format)
if checking_audio_fs is not None:
self.audio_fs = checking_audio_fs
if audio_fs is not None:
self.cmd['fs']['audio_fs'] = audio_fs
else:
self.cmd['fs']['audio_fs'] = self.audio_fs
output = self.preprocessor.forward(self.model_cfg, self.recog_type,
self.audio_format, self.audio_in,
self.audio_fs, self.cmd)
output = self.forward(output, **kwargs)
rst = self.postprocess(output)
return rst
def get_cmd(self, extra_args, model_path) -> Dict[str, Any]:
if self.preprocessor is None:
self.preprocessor = WavToScp()
outputs = self.preprocessor.config_checking(self.model_cfg)
# generate asr inference command
cmd = {
'maxlenratio': 0.0,
'minlenratio': 0.0,
'batch_size': 1,
'beam_size': 1,
'ngpu': 1,
'ctc_weight': 0.0,
'lm_weight': 0.0,
'penalty': 0.0,
'log_level': 'ERROR',
'asr_train_config': None,
'asr_model_file': outputs['am_model_path'],
'cmvn_file': None,
'lm_train_config': None,
'lm_file': None,
'token_type': None,
'key_file': None,
'word_lm_train_config': None,
'bpemodel': None,
'allow_variable_data_keys': False,
'output_dir': None,
'dtype': 'float32',
'seed': 0,
'ngram_weight': 0.9,
'nbest': 1,
'num_workers': 0,
'vad_infer_config': None,
'vad_model_file': None,
'vad_cmvn_file': None,
'time_stamp_writer': True,
'punc_infer_config': None,
'punc_model_file': None,
'timestamp_infer_config': None,
'timestamp_model_file': None,
'timestamp_cmvn_file': None,
'outputs_dict': True,
'param_dict': None,
'model_type': outputs['model_type'],
'idx_text': '',
'sampled_ids': 'seq2seq/sampled_ids',
'sampled_lengths': 'seq2seq/sampled_lengths',
'model_lang': outputs['model_lang'],
'code_base': outputs['code_base'],
'mode': outputs['mode'],
'fs': {
'model_fs': None,
'audio_fs': None
},
'fake_streaming': False,
}
frontend_conf = None
token_num_relax = None
decoding_ind = None
decoding_mode = None
fake_streaming = False
if os.path.exists(outputs['am_model_config']):
config_file = open(outputs['am_model_config'], encoding='utf-8')
root = yaml.full_load(config_file)
config_file.close()
if 'frontend_conf' in root:
frontend_conf = root['frontend_conf']
if os.path.exists(outputs['asr_model_config']):
config_file = open(outputs['asr_model_config'], encoding='utf-8')
root = yaml.full_load(config_file)
config_file.close()
if 'token_num_relax' in root:
token_num_relax = root['token_num_relax']
if 'decoding_ind' in root:
decoding_ind = root['decoding_ind']
if 'decoding_mode' in root:
decoding_mode = root['decoding_mode']
cmd['beam_size'] = root['beam_size']
cmd['penalty'] = root['penalty']
cmd['maxlenratio'] = root['maxlenratio']
cmd['minlenratio'] = root['minlenratio']
cmd['ctc_weight'] = root['ctc_weight']
cmd['lm_weight'] = root['lm_weight']
cmd['asr_train_config'] = outputs['am_model_config']
cmd['lm_file'] = outputs['lm_model_path']
cmd['lm_train_config'] = outputs['lm_model_config']
cmd['batch_size'] = outputs['model_config']['batch_size']
cmd['frontend_conf'] = frontend_conf
if frontend_conf is not None and 'fs' in frontend_conf:
cmd['fs']['model_fs'] = frontend_conf['fs']
cmd['token_num_relax'] = token_num_relax
cmd['decoding_ind'] = decoding_ind
cmd['decoding_mode'] = decoding_mode
cmd['fake_streaming'] = fake_streaming
if outputs.__contains__('mvn_file'):
cmd['cmvn_file'] = outputs['mvn_file']
model_config = self.model_cfg['model_config']
if model_config.__contains__('vad_model') and self.vad_model is None:
self.vad_model = model_config['vad_model']
if model_config.__contains__('vad_model_revision'):
self.vad_model_revision = model_config['vad_model_revision']
if model_config.__contains__('punc_model') and self.punc_model is None:
self.punc_model = model_config['punc_model']
if model_config.__contains__('punc_model_revision'):
self.punc_model_revision = model_config['punc_model_revision']
if model_config.__contains__(
'timestamp_model') and self.timestamp_model is None:
self.timestamp_model = model_config['timestamp_model']
if model_config.__contains__('timestamp_model_revision'):
self.timestamp_model_revision = model_config[
'timestamp_model_revision']
update_local_model(model_config, model_path, extra_args)
self.load_vad_model(cmd)
self.load_punc_model(cmd)
self.load_lm_model(cmd)
self.load_timestamp_model(cmd)
user_args_dict = [
'output_dir',
'batch_size',
'mode',
'ngpu',
'beam_size',
'ctc_weight',
'lm_weight',
'decoding_ind',
'decoding_mode',
'vad_model_file',
'vad_infer_config',
'vad_cmvn_file',
'punc_model_file',
'punc_infer_config',
'param_dict',
'fake_streaming',
]
for user_args in user_args_dict:
if user_args in extra_args:
if extra_args.get(user_args) is not None:
cmd[user_args] = extra_args[user_args]
del extra_args[user_args]
return cmd
def load_vad_model(self, cmd):
if self.vad_model is not None and self.vad_model != '':
if os.path.exists(self.vad_model):
vad_model = self.vad_model
else:
vad_model = snapshot_download(
self.vad_model, revision=self.vad_model_revision)
logger.info('loading vad model from {0} ...'.format(vad_model))
config_path = os.path.join(vad_model, ModelFile.CONFIGURATION)
model_cfg = json.loads(open(config_path).read())
model_dir = os.path.dirname(config_path)
cmd['vad_model_file'] = os.path.join(
model_dir,
model_cfg['model']['model_config']['vad_model_name'])
cmd['vad_infer_config'] = os.path.join(
model_dir,
model_cfg['model']['model_config']['vad_model_config'])
cmd['vad_cmvn_file'] = os.path.join(
model_dir, model_cfg['model']['model_config']['vad_mvn_file'])
if 'vad' not in cmd['mode']:
cmd['mode'] = cmd['mode'] + '_vad'
def load_punc_model(self, cmd):
if self.punc_model is not None and self.punc_model != '':
if os.path.exists(self.punc_model):
punc_model = self.punc_model
else:
punc_model = snapshot_download(
self.punc_model, revision=self.punc_model_revision)
logger.info(
'loading punctuation model from {0} ...'.format(punc_model))
config_path = os.path.join(punc_model, ModelFile.CONFIGURATION)
model_cfg = json.loads(open(config_path).read())
model_dir = os.path.dirname(config_path)
cmd['punc_model_file'] = os.path.join(
model_dir, model_cfg['model']['punc_model_name'])
cmd['punc_infer_config'] = os.path.join(
model_dir,
model_cfg['model']['punc_model_config']['punc_config'])
if 'punc' not in cmd['mode']:
cmd['mode'] = cmd['mode'] + '_punc'
def load_lm_model(self, cmd):
if self.lm_model is not None and self.lm_model != '':
if os.path.exists(self.lm_model):
lm_model = self.lm_model
else:
lm_model = snapshot_download(
self.lm_model, revision=self.lm_model_revision)
logger.info('loading language model from {0} ...'.format(lm_model))
config_path = os.path.join(lm_model, ModelFile.CONFIGURATION)
model_cfg = json.loads(open(config_path).read())
model_dir = os.path.dirname(config_path)
cmd['lm_file'] = os.path.join(
model_dir, model_cfg['model']['model_config']['lm_model_name'])
cmd['lm_train_config'] = os.path.join(
model_dir,
model_cfg['model']['model_config']['lm_model_config'])
# FIXME
def load_timestamp_model(self, cmd):
if self.timestamp_model is not None and self.timestamp_model != '':
if os.path.exists(self.timestamp_model):
timestamp_model = self.timestamp_model
else:
timestamp_model = snapshot_download(
self.timestamp_model,
revision=self.timestamp_model_revision)
logger.info(
'loading timestamp model from {0} ...'.format(timestamp_model))
config_path = os.path.join(timestamp_model,
ModelFile.CONFIGURATION)
model_cfg = json.loads(open(config_path).read())
model_dir = os.path.dirname(config_path)
cmd['timestamp_model_file'] = os.path.join(
model_dir,
model_cfg['model']['model_config']['timestamp_model_file'])
cmd['timestamp_infer_config'] = os.path.join(
model_dir,
model_cfg['model']['model_config']['timestamp_infer_config'])
cmd['timestamp_cmvn_file'] = os.path.join(
model_dir,
model_cfg['model']['model_config']['timestamp_cmvn_file'])
def forward(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
"""Decoding
"""
logger.info(f"Decoding with {inputs['audio_format']} files ...")
data_cmd: Sequence[Tuple[str, str, str]]
if isinstance(self.audio_in, bytes):
data_cmd = [self.audio_in, 'speech', 'bytes']
elif isinstance(self.audio_in, str):
data_cmd = [self.audio_in, 'speech', 'sound']
elif self.raw_inputs is not None:
data_cmd = None
# generate asr inference command
self.cmd['name_and_type'] = data_cmd
self.cmd['raw_inputs'] = self.raw_inputs
self.cmd['audio_in'] = self.audio_in
inputs['asr_result'] = self.run_inference(self.cmd, **kwargs)
return inputs
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""process the asr results
"""
from funasr.utils import asr_utils
logger.info('Computing the result of ASR ...')
rst = {}
# single wav or pcm task
if inputs['recog_type'] == 'wav':
if 'asr_result' in inputs and len(inputs['asr_result']) > 0:
for key, value in inputs['asr_result'][0].items():
if key == 'value':
if len(value) > 0:
rst[OutputKeys.TEXT] = value
elif key != 'key':
rst[key] = value
# run with datasets, and audio format is waveform or kaldi_ark or tfrecord
elif inputs['recog_type'] != 'wav':
inputs['reference_list'] = self.ref_list_tidy(inputs)
inputs['datasets_result'] = asr_utils.compute_wer(
hyp_list=inputs['asr_result'],
ref_list=inputs['reference_list'])
else:
raise ValueError('recog_type and audio_format are mismatching')
if 'datasets_result' in inputs:
rst[OutputKeys.TEXT] = inputs['datasets_result']
return rst
def ref_list_tidy(self, inputs: Dict[str, Any]) -> List[Any]:
ref_list = []
if inputs['audio_format'] == 'tfrecord':
# should assemble idx + txt
with open(inputs['reference_text'], 'r', encoding='utf-8') as r:
text_lines = r.readlines()
with open(inputs['idx_text'], 'r', encoding='utf-8') as i:
idx_lines = i.readlines()
j: int = 0
while j < min(len(text_lines), len(idx_lines)):
idx_str = idx_lines[j].strip()
text_str = text_lines[j].strip().replace(' ', '')
item = {'key': idx_str, 'value': text_str}
ref_list.append(item)
j += 1
else:
# text contain idx + sentence
with open(inputs['reference_text'], 'r', encoding='utf-8') as f:
lines = f.readlines()
for line in lines:
line_item = line.split(None, 1)
if len(line_item) > 1:
item = {
'key': line_item[0],
'value': line_item[1].strip('\n')
}
ref_list.append(item)
return ref_list
def run_inference(self, cmd, **kwargs):
asr_result = self.funasr_infer_modelscope(cmd['name_and_type'],
cmd['raw_inputs'],
cmd['output_dir'], cmd['fs'],
cmd['param_dict'], **kwargs)
return asr_result

View File

@@ -35,7 +35,7 @@ class WeNetAutomaticSpeechRecognitionPipeline(Pipeline):
audio_fs: int = None,
recog_type: str = None,
audio_format: str = None) -> Dict[str, Any]:
-from funasr.utils import asr_utils
+# from funasr.utils import asr_utils
self.recog_type = recog_type
self.audio_format = audio_format
@@ -54,17 +54,17 @@ class WeNetAutomaticSpeechRecognitionPipeline(Pipeline):
if checking_audio_fs is not None:
self.audio_fs = checking_audio_fs
-if recog_type is None or audio_format is None:
-self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking(
-audio_in=self.audio_in,
-recog_type=recog_type,
-audio_format=audio_format)
-if hasattr(asr_utils, 'sample_rate_checking'):
-checking_audio_fs = asr_utils.sample_rate_checking(
-self.audio_in, self.audio_format)
-if checking_audio_fs is not None:
-self.audio_fs = checking_audio_fs
+# if recog_type is None or audio_format is None:
+# self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking(
+# audio_in=self.audio_in,
+# recog_type=recog_type,
+# audio_format=audio_format)
+# if hasattr(asr_utils, 'sample_rate_checking'):
+# checking_audio_fs = asr_utils.sample_rate_checking(
+# self.audio_in, self.audio_format)
+# if checking_audio_fs is not None:
+# self.audio_fs = checking_audio_fs
inputs = {
'audio': self.audio_in,

View File

@@ -0,0 +1,75 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, List, Sequence, Tuple, Union
import json
import yaml
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_scp_from_url,
update_local_model)
from modelscope.utils.constant import Frameworks, ModelFile, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['FunASRPipeline']
@PIPELINES.register_module(
Tasks.auto_speech_recognition, module_name=Pipelines.funasr_pipeline)
@PIPELINES.register_module(
Tasks.voice_activity_detection, module_name=Pipelines.funasr_pipeline)
@PIPELINES.register_module(
Tasks.language_score_prediction, module_name=Pipelines.funasr_pipeline)
@PIPELINES.register_module(
Tasks.punctuation, module_name=Pipelines.funasr_pipeline)
@PIPELINES.register_module(
Tasks.speaker_diarization, module_name=Pipelines.funasr_pipeline)
@PIPELINES.register_module(
Tasks.speaker_verification, module_name=Pipelines.funasr_pipeline)
@PIPELINES.register_module(
Tasks.speech_separation, module_name=Pipelines.funasr_pipeline)
@PIPELINES.register_module(
Tasks.speech_timestamp, module_name=Pipelines.funasr_pipeline)
@PIPELINES.register_module(
Tasks.emotion_recognition, module_name=Pipelines.funasr_pipeline)
class FunASRPipeline(Pipeline):
"""Voice Activity Detection Inference Pipeline
use `model` to create a Voice Activity Detection pipeline.
Args:
model: A model instance, or a model local dir, or a model id in the model hub.
kwargs (dict, `optional`):
Extra kwargs passed into the preprocessor's constructor.
Example:
>>> from modelscope.pipelines import pipeline
>>> from modelscope.utils.constant import Tasks
>>> p = pipeline(
>>> task=Tasks.voice_activity_detection, model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch')
>>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm'
>>> print(p(audio_in))
"""
def __init__(self, model: Union[Model, str] = None, **kwargs):
"""use `model` to create an vad pipeline for prediction
"""
super().__init__(model=model, **kwargs)
def __call__(self, *args, **kwargs) -> Dict[str, Any]:
"""
Decode the input audios.
Args:
input ('str' or 'bytes'): the input audio, e.g. a local path, a url or raw bytes.
Return:
A dictionary of results or a list of dictionaries of results.
"""
output = self.model(*args, **kwargs)
return output
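A minimal usage sketch of the unified pipeline added above, reusing the model id and test audio URL that appear elsewhere in this change (whether that exact model id is published for FunASR 1.0 is an assumption):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Assumed model id; any FunASR-1.0-compatible ASR model on ModelScope should work here.
asr = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')

rec_result = asr(
    'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
print(rec_result)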

View File

@@ -1,230 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, Union
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_text_from_url,
update_local_model)
from modelscope.utils.config import Config
from modelscope.utils.constant import Frameworks, ModelFile, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['LanguageModelPipeline']
@PIPELINES.register_module(
Tasks.language_score_prediction, module_name=Pipelines.lm_inference)
class LanguageModelPipeline(Pipeline):
"""Language Model Inference Pipeline
Example:
>>> from modelscope.pipelines import pipeline
>>> from modelscope.utils.constant import Tasks
>>> inference_pipeline = pipeline(
>>> task=Tasks.language_score_prediction,
>>> model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch')
>>> text_in='hello 大 家 好 呀'
>>> print(inference_pipeline(text_in))
"""
def __init__(self,
model: Union[Model, str] = None,
ngpu: int = 1,
**kwargs):
"""
Use `model` to create a LM pipeline for prediction
Args:
model ('Model' or 'str'):
The pipeline handles three types of model:
- A model instance
- A model local dir
- A model id in the model hub
output_dir('str'):
output dir path
batch_size('int'):
the batch size for inference
ngpu('int'):
the number of gpus, 0 indicates CPU mode
model_file('str'):
LM model file
train_config('str'):
LM infer configuration
num_workers('int'):
the number of workers used for DataLoader
log_level('str'):
log level
log_base('float', defaults to 10.0):
the base of logarithm for Perplexity
split_with_space('bool'):
split the input sentence by space
seg_dict_file('str'):
seg dict file
param_dict('dict'):
extra kwargs
"""
super().__init__(model=model, **kwargs)
config_path = os.path.join(model, ModelFile.CONFIGURATION)
self.cmd = self.get_cmd(config_path, kwargs, model)
from funasr.bin import lm_inference_launch
self.funasr_infer_modelscope = lm_inference_launch.inference_launch(
mode=self.cmd['mode'],
batch_size=self.cmd['batch_size'],
dtype=self.cmd['dtype'],
ngpu=ngpu,
seed=self.cmd['seed'],
num_workers=self.cmd['num_workers'],
log_level=self.cmd['log_level'],
key_file=self.cmd['key_file'],
train_config=self.cmd['train_config'],
model_file=self.cmd['model_file'],
log_base=self.cmd['log_base'],
split_with_space=self.cmd['split_with_space'],
seg_dict_file=self.cmd['seg_dict_file'],
output_dir=self.cmd['output_dir'],
param_dict=self.cmd['param_dict'],
**kwargs,
)
def __call__(self,
text_in: str = None,
output_dir: str = None,
param_dict: dict = None) -> Dict[str, Any]:
"""
Compute PPL
Args:
text_in('str'):
- A text str input
- A local text file input endswith .txt or .scp
- A url text file input
output_dir('str'):
output dir
param_dict('dict'):
extra kwargs
Return:
A dictionary of results or a list of dictionaries of results.
The dictionary contains the following keys:
- **text** ('str') -- The PPL result.
"""
if len(text_in) == 0:
raise ValueError('The input of lm should not be null.')
else:
self.text_in = text_in
if output_dir is not None:
self.cmd['output_dir'] = output_dir
if param_dict is not None:
self.cmd['param_dict'] = param_dict
output = self.forward(self.text_in)
result = self.postprocess(output)
return result
def postprocess(self, inputs: list) -> Dict[str, Any]:
"""Postprocessing
"""
rst = {}
for i in range(len(inputs)):
if i == 0:
text = inputs[0]['value']
if len(text) > 0:
rst[OutputKeys.TEXT] = text
else:
rst[inputs[i]['key']] = inputs[i]['value']
return rst
def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]:
# generate inference command
model_cfg = Config.from_file(config_path)
model_dir = os.path.dirname(config_path)
mode = model_cfg.model['model_config']['mode']
lm_model_path = os.path.join(
model_dir, model_cfg.model['model_config']['lm_model_name'])
lm_model_config = os.path.join(
model_dir, model_cfg.model['model_config']['lm_model_config'])
seg_dict_file = None
if 'seg_dict_file' in model_cfg.model['model_config']:
seg_dict_file = os.path.join(
model_dir, model_cfg.model['model_config']['seg_dict_file'])
update_local_model(model_cfg.model['model_config'], model_path,
extra_args)
cmd = {
'mode': mode,
'batch_size': 1,
'dtype': 'float32',
'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available
'seed': 0,
'num_workers': 0,
'log_level': 'ERROR',
'key_file': None,
'train_config': lm_model_config,
'model_file': lm_model_path,
'log_base': 10.0,
'allow_variable_data_keys': False,
'split_with_space': True,
'seg_dict_file': seg_dict_file,
'output_dir': None,
'param_dict': None,
}
user_args_dict = [
'batch_size',
'ngpu',
'num_workers',
'log_level',
'train_config',
'model_file',
'log_base',
'split_with_space',
'seg_dict_file',
'output_dir',
'param_dict',
]
for user_args in user_args_dict:
if user_args in extra_args:
if extra_args.get(user_args) is not None:
cmd[user_args] = extra_args[user_args]
del extra_args[user_args]
return cmd
def forward(self, text_in: str = None) -> list:
"""Decoding
"""
logger.info('Compute PPL : {0} ...'.format(text_in))
# generate text_in
text_file, raw_inputs = generate_text_from_url(text_in)
data_cmd = None
if raw_inputs is None:
data_cmd = [(text_file, 'text', 'text')]
elif text_file is None and raw_inputs is not None:
data_cmd = None
self.cmd['name_and_type'] = data_cmd
self.cmd['raw_inputs'] = raw_inputs
lm_result = self.run_inference(self.cmd)
return lm_result
def run_inference(self, cmd):
if self.framework == Frameworks.torch:
lm_result = self.funasr_infer_modelscope(
data_path_and_name_and_type=cmd['name_and_type'],
raw_inputs=cmd['raw_inputs'],
output_dir_v2=cmd['output_dir'],
param_dict=cmd['param_dict'])
else:
raise ValueError('model type is mismatching')
return lm_result

View File

@@ -1,183 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
from typing import Any, Dict, List, Sequence, Tuple, Union
import yaml
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_text_from_url,
update_local_model)
from modelscope.utils.constant import Frameworks, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['PunctuationProcessingPipeline']
@PIPELINES.register_module(
Tasks.punctuation, module_name=Pipelines.punc_inference)
class PunctuationProcessingPipeline(Pipeline):
"""Punctuation Processing Inference Pipeline
use `model` to create a Punctuation Processing pipeline.
Args:
model (PunctuationProcessingPipeline): A model instance, or a model local dir, or a model id in the model hub.
kwargs (dict, `optional`):
Extra kwargs passed into the preprocessor's constructor.
Examples
>>> from modelscope.pipelines import pipeline
>>> pipeline_punc = pipeline(
>>> task=Tasks.punctuation, model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch')
>>> text_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt'
>>> print(pipeline_punc(text_in))
"""
def __init__(self,
model: Union[Model, str] = None,
ngpu: int = 1,
**kwargs):
"""use `model` to create an asr pipeline for prediction
"""
super().__init__(model=model, **kwargs)
self.model_cfg = self.model.forward()
self.cmd = self.get_cmd(kwargs, model)
from funasr.bin import punc_inference_launch
self.funasr_infer_modelscope = punc_inference_launch.inference_launch(
mode=self.cmd['mode'],
batch_size=self.cmd['batch_size'],
dtype=self.cmd['dtype'],
ngpu=ngpu,
seed=self.cmd['seed'],
num_workers=self.cmd['num_workers'],
log_level=self.cmd['log_level'],
key_file=self.cmd['key_file'],
train_config=self.cmd['train_config'],
model_file=self.cmd['model_file'],
output_dir=self.cmd['output_dir'],
param_dict=self.cmd['param_dict'],
**kwargs,
)
def __call__(self,
text_in: str = None,
output_dir: str = None,
cache: List[Any] = None,
param_dict: dict = None) -> Dict[str, Any]:
if len(text_in) == 0:
raise ValueError('The input of punctuation should not be null.')
else:
self.text_in = text_in
if output_dir is not None:
self.cmd['output_dir'] = output_dir
if cache is not None:
self.cmd['cache'] = cache
if param_dict is not None:
self.cmd['param_dict'] = param_dict
output = self.forward(self.text_in)
result = self.postprocess(output)
return result
def postprocess(self, inputs: list) -> Dict[str, Any]:
"""Postprocessing
"""
rst = {}
for i in range(len(inputs)):
if i == 0:
for key, value in inputs[0].items():
if key == 'value':
if len(value) > 0:
rst[OutputKeys.TEXT] = value
elif key != 'key':
rst[key] = value
else:
rst[inputs[i]['key']] = inputs[i]['value']
return rst
def get_cmd(self, extra_args, model_path) -> Dict[str, Any]:
# generate inference command
lang = self.model_cfg['model_config']['lang']
punc_model_path = self.model_cfg['punc_model_path']
punc_model_config = os.path.join(
self.model_cfg['model_workspace'],
self.model_cfg['model_config']['punc_config'])
mode = self.model_cfg['model_config']['mode']
update_local_model(self.model_cfg['model_config'], model_path,
extra_args)
cmd = {
'mode': mode,
'batch_size': 1,
'dtype': 'float32',
'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available
'seed': 0,
'num_workers': 0,
'log_level': 'ERROR',
'key_file': None,
'train_config': punc_model_config,
'model_file': punc_model_path,
'output_dir': None,
'lang': lang,
'cache': None,
'param_dict': None,
}
user_args_dict = [
'batch_size',
'dtype',
'ngpu',
'seed',
'num_workers',
'log_level',
'train_config',
'model_file',
'output_dir',
'lang',
'param_dict',
]
for user_args in user_args_dict:
if user_args in extra_args:
if extra_args.get(user_args) is not None:
cmd[user_args] = extra_args[user_args]
del extra_args[user_args]
return cmd
def forward(self, text_in: str = None) -> list:
"""Decoding
"""
logger.info('Punctuation Processing: {0} ...'.format(text_in))
# generate text_in
text_file, raw_inputs = generate_text_from_url(text_in)
if raw_inputs is None:
data_cmd = [(text_file, 'text', 'text')]
elif text_file is None and raw_inputs is not None:
data_cmd = None
self.cmd['name_and_type'] = data_cmd
self.cmd['raw_inputs'] = raw_inputs
punc_result = self.run_inference(self.cmd)
return punc_result
def run_inference(self, cmd):
punc_result = ''
if self.framework == Frameworks.torch:
punc_result = self.funasr_infer_modelscope(
data_path_and_name_and_type=cmd['name_and_type'],
raw_inputs=cmd['raw_inputs'],
output_dir_v2=cmd['output_dir'],
cache=cmd['cache'],
param_dict=cmd['param_dict'])
else:
raise ValueError('model type is mismatching')
return punc_result

View File

@@ -1,287 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
import json
import numpy
import yaml
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_scp_for_sv,
generate_sd_scp_from_url,
update_local_model)
from modelscope.utils.constant import Frameworks, ModelFile, Tasks
from modelscope.utils.hub import snapshot_download
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['SpeakerDiarizationPipeline']
@PIPELINES.register_module(
Tasks.speaker_diarization,
module_name=Pipelines.speaker_diarization_inference)
class SpeakerDiarizationPipeline(Pipeline):
"""Speaker Diarization Inference Pipeline
use `model` to create a Speaker Diarization pipeline.
Args:
model (SpeakerDiarizationPipeline): A model instance, or a model local dir, or a model id in the model hub.
kwargs (dict, `optional`):
Extra kwargs passed into the preprocessor's constructor.
Examples:
>>> from modelscope.pipelines import pipeline
>>> pipeline_sd = pipeline(
>>> task=Tasks.speaker_diarization, model='damo/xxxxxxxxxxxxx')
>>> audio_in=('','','','')
>>> print(pipeline_sd(audio_in))
"""
def __init__(self,
model: Union[Model, str] = None,
sv_model: Optional[Union[Model, str]] = None,
sv_model_revision: Optional[str] = None,
ngpu: int = 1,
**kwargs):
"""use `model` to create a speaker diarization pipeline for prediction
Args:
model ('Model' or 'str'):
The pipeline handles three types of model:
- A model instance
- A model local dir
- A model id in the model hub
sv_model (Optional: 'Model' or 'str'):
speaker verification model from model hub or local
example: 'damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch'
sv_model_revision (Optional: 'str'):
speaker verification model revision from model hub
"""
super().__init__(model=model, **kwargs)
self.model_cfg = None
config_path = os.path.join(model, ModelFile.CONFIGURATION)
self.sv_model = sv_model
self.sv_model_revision = sv_model_revision
self.cmd = self.get_cmd(config_path, kwargs, model)
from funasr.bin import diar_inference_launch
self.funasr_infer_modelscope = diar_inference_launch.inference_launch(
mode=self.cmd['mode'],
output_dir=self.cmd['output_dir'],
batch_size=self.cmd['batch_size'],
dtype=self.cmd['dtype'],
ngpu=ngpu,
seed=self.cmd['seed'],
num_workers=self.cmd['num_workers'],
log_level=self.cmd['log_level'],
key_file=self.cmd['key_file'],
diar_train_config=self.cmd['diar_train_config'],
diar_model_file=self.cmd['diar_model_file'],
model_tag=self.cmd['model_tag'],
allow_variable_data_keys=self.cmd['allow_variable_data_keys'],
streaming=self.cmd['streaming'],
smooth_size=self.cmd['smooth_size'],
dur_threshold=self.cmd['dur_threshold'],
out_format=self.cmd['out_format'],
param_dict=self.cmd['param_dict'],
**kwargs,
)
def __call__(self,
audio_in: Union[tuple, str, Any] = None,
output_dir: str = None,
param_dict: dict = None) -> Dict[str, Any]:
"""
Decoding the input audios
Args:
audio_in('str' or 'bytes'):
- A string containing a local path to a wav file
- A string containing a local path to a scp
- A string containing a wav url
- A bytes input
output_dir('str'):
output dir
param_dict('dict'):
extra kwargs
Return:
A dictionary of results or a list of dictionaries of results.
The dictionary contains the following keys:
- **text** ('str') -- The speaker diarization result.
"""
if len(audio_in) == 0:
raise ValueError('The input of sv should not be null.')
else:
self.audio_in = audio_in
if output_dir is not None:
self.cmd['output_dir'] = output_dir
self.cmd['param_dict'] = param_dict
output = self.forward(self.audio_in)
result = self.postprocess(output)
return result
def postprocess(self, inputs: list) -> Dict[str, Any]:
"""Postprocessing
"""
rst = {}
for i in range(len(inputs)):
# for demo service
if i == 0 and len(inputs) == 1:
rst[OutputKeys.TEXT] = inputs[0]['value']
else:
rst[inputs[i]['key']] = inputs[i]['value']
return rst
def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]:
self.model_cfg = json.loads(open(config_path).read())
model_dir = os.path.dirname(config_path)
# generate sd inference command
mode = self.model_cfg['model']['model_config']['mode']
diar_model_path = os.path.join(
model_dir,
self.model_cfg['model']['model_config']['diar_model_name'])
diar_model_config = os.path.join(
model_dir,
self.model_cfg['model']['model_config']['diar_model_config'])
update_local_model(self.model_cfg['model']['model_config'], model_path,
extra_args)
cmd = {
'mode': mode,
'output_dir': None,
'batch_size': 1,
'dtype': 'float32',
'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available
'seed': 0,
'num_workers': 0,
'log_level': 'ERROR',
'key_file': None,
'diar_model_file': diar_model_path,
'diar_train_config': diar_model_config,
'model_tag': None,
'allow_variable_data_keys': True,
'streaming': False,
'smooth_size': 83,
'dur_threshold': 10,
'out_format': 'vad',
'param_dict': {
'sv_model_file': None,
'sv_train_config': None
},
}
user_args_dict = [
'mode',
'output_dir',
'batch_size',
'ngpu',
'log_level',
'allow_variable_data_keys',
'streaming',
'num_workers',
'smooth_size',
'dur_threshold',
'out_format',
'param_dict',
]
model_config = self.model_cfg['model']['model_config']
if model_config.__contains__('sv_model') and self.sv_model != '':
self.sv_model = model_config['sv_model']
if model_config.__contains__('sv_model_revision'):
self.sv_model_revision = model_config['sv_model_revision']
self.load_sv_model(cmd)
# rewrite the config with user args
for user_args in user_args_dict:
if user_args in extra_args:
if extra_args.get(user_args) is not None:
if isinstance(cmd[user_args], dict) and isinstance(
extra_args[user_args], dict):
cmd[user_args].update(extra_args[user_args])
else:
cmd[user_args] = extra_args[user_args]
del extra_args[user_args]
return cmd
def load_sv_model(self, cmd):
if self.sv_model is not None and self.sv_model != '':
if os.path.exists(self.sv_model):
sv_model = self.sv_model
else:
sv_model = snapshot_download(
self.sv_model, revision=self.sv_model_revision)
logger.info(
'loading speaker verification model from {0} ...'.format(
sv_model))
config_path = os.path.join(sv_model, ModelFile.CONFIGURATION)
model_cfg = json.loads(open(config_path).read())
model_dir = os.path.dirname(config_path)
cmd['param_dict']['sv_model_file'] = os.path.join(
model_dir, model_cfg['model']['model_config']['sv_model_name'])
cmd['param_dict']['sv_train_config'] = os.path.join(
model_dir,
model_cfg['model']['model_config']['sv_model_config'])
def forward(self, audio_in: Union[tuple, str, Any] = None) -> list:
"""Decoding
"""
# log file_path/url or tuple (str, str)
if isinstance(audio_in, str) or \
(isinstance(audio_in, tuple) and all(isinstance(item, str) for item in audio_in)):
logger.info(f'Speaker Diarization Processing: {audio_in} ...')
else:
logger.info(
f'Speaker Diarization Processing: {str(audio_in)[:100]} ...')
data_cmd, raw_inputs = None, None
if isinstance(audio_in, tuple) or isinstance(audio_in, list):
# generate audio_scp
if isinstance(audio_in[0], str):
# for scp inputs
if len(audio_in[0].split(',')) == 3 and audio_in[0].split(
',')[0].endswith('.scp'):
data_cmd = []
for audio_cmd in audio_in:
if len(audio_cmd.split(',')) == 3 and audio_cmd.split(
',')[0].endswith('.scp'):
data_cmd.append(tuple(audio_cmd.split(',')))
# for audio-list inputs
else:
raw_inputs = generate_sd_scp_from_url(audio_in)
# for raw bytes inputs
elif isinstance(audio_in[0], (bytes, numpy.ndarray)):
raw_inputs = audio_in
else:
raise TypeError(
'Unsupported data type, it must be data_name_type_path, '
'file_path, url, bytes or numpy.ndarray')
else:
raise TypeError(
'audio_in must be a list of data_name_type_path, file_path, '
'url, bytes or numpy.ndarray')
self.cmd['name_and_type'] = data_cmd
self.cmd['raw_inputs'] = raw_inputs
result = self.run_inference(self.cmd)
return result
def run_inference(self, cmd):
if self.framework == Frameworks.torch:
diar_result = self.funasr_infer_modelscope(
data_path_and_name_and_type=cmd['name_and_type'],
raw_inputs=cmd['raw_inputs'],
output_dir_v2=cmd['output_dir'],
param_dict=cmd['param_dict'])
else:
raise ValueError(
'framework is mismatching, which should be pytorch')
return diar_result

View File

@@ -1,264 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
from typing import Any, Dict, List, Sequence, Tuple, Union
import yaml
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_scp_for_sv,
generate_sv_scp_from_url,
update_local_model)
from modelscope.utils.constant import Frameworks, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['SpeakerVerificationPipeline']
@PIPELINES.register_module(
Tasks.speaker_verification, module_name=Pipelines.sv_inference)
class SpeakerVerificationPipeline(Pipeline):
"""Speaker Verification Inference Pipeline
use `model` to create a Speaker Verification pipeline.
Args:
model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub.
kwargs (dict, `optional`):
Extra kwargs passed into the preprocessor's constructor.
Examples:
>>> from modelscope.pipelines import pipeline
>>> pipeline_sv = pipeline(
>>> task=Tasks.speaker_verification, model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch')
>>> audio_in=('sv_example_enroll.wav', 'sv_example_same.wav')
>>> print(pipeline_sv(audio_in))
>>> # {'label': ['Same', 'Different'], 'scores': [0.8540488358969999, 0.14595116410300013]}
"""
def __init__(self,
model: Union[Model, str] = None,
ngpu: int = 1,
**kwargs):
"""use `model` to create an asr pipeline for prediction
"""
super().__init__(model=model, **kwargs)
self.model_cfg = self.model.forward()
self.cmd = self.get_cmd(kwargs, model)
from funasr.bin import sv_inference_launch
self.funasr_infer_modelscope = sv_inference_launch.inference_launch(
mode=self.cmd['mode'],
output_dir=self.cmd['output_dir'],
batch_size=self.cmd['batch_size'],
dtype=self.cmd['dtype'],
ngpu=ngpu,
seed=self.cmd['seed'],
num_workers=self.cmd['num_workers'],
log_level=self.cmd['log_level'],
key_file=self.cmd['key_file'],
sv_train_config=self.cmd['sv_train_config'],
sv_model_file=self.cmd['sv_model_file'],
model_tag=self.cmd['model_tag'],
allow_variable_data_keys=self.cmd['allow_variable_data_keys'],
streaming=self.cmd['streaming'],
embedding_node=self.cmd['embedding_node'],
sv_threshold=self.cmd['sv_threshold'],
param_dict=self.cmd['param_dict'],
**kwargs,
)
def __call__(self,
audio_in: Union[tuple, str, Any] = None,
output_dir: str = None,
param_dict: dict = None) -> Dict[str, Any]:
if len(audio_in) == 0:
raise ValueError('The input of sv should not be null.')
else:
self.audio_in = audio_in
if output_dir is not None:
self.cmd['output_dir'] = output_dir
self.cmd['param_dict'] = param_dict
output = self.forward(self.audio_in)
result = self.postprocess(output)
return result
def postprocess(self, inputs: list) -> Dict[str, Any]:
"""Postprocessing
"""
rst = {}
for i in range(len(inputs)):
# for single input, re-formate the output
# audio_in:
# list/tuple: return speaker verification scores
# single wav/bytes: return speaker embedding
if len(inputs) == 1 and i == 0:
if isinstance(self.audio_in, tuple) or isinstance(
self.audio_in, list):
score = inputs[0]['value']
rst[OutputKeys.LABEL] = ['Same', 'Different']
rst[OutputKeys.SCORES] = [score / 100.0, 1 - score / 100.0]
else:
embedding = inputs[0]['value']
rst[OutputKeys.SPK_EMBEDDING] = embedding
else:
# for multiple inputs
rst[inputs[i]['key']] = inputs[i]['value']
return rst
def get_cmd(self, extra_args, model_path) -> Dict[str, Any]:
# generate asr inference command
mode = self.model_cfg['model_config']['mode']
sv_model_path = self.model_cfg['model_path']
sv_model_config = os.path.join(
self.model_cfg['model_workspace'],
self.model_cfg['model_config']['sv_model_config'])
update_local_model(self.model_cfg['model_config'], model_path,
extra_args)
cmd = {
'mode': mode,
'output_dir': None,
'batch_size': 1,
'dtype': 'float32',
'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available
'seed': 0,
'num_workers': 0,
'log_level': 'ERROR',
'key_file': None,
'sv_model_file': sv_model_path,
'sv_train_config': sv_model_config,
'model_tag': None,
'allow_variable_data_keys': True,
'streaming': False,
'embedding_node': 'resnet1_dense',
'sv_threshold': 0.9465,
'param_dict': None,
}
user_args_dict = [
'output_dir',
'batch_size',
'ngpu',
'embedding_node',
'sv_threshold',
'log_level',
'allow_variable_data_keys',
'streaming',
'num_workers',
'param_dict',
]
# re-write the config with configure.json
for user_args in user_args_dict:
if (user_args in self.model_cfg['model_config']
and self.model_cfg['model_config'][user_args] is not None):
if isinstance(cmd[user_args], dict) and isinstance(
self.model_cfg['model_config'][user_args], dict):
cmd[user_args].update(
self.model_cfg['model_config'][user_args])
else:
cmd[user_args] = self.model_cfg['model_config'][user_args]
# rewrite the config with user args
for user_args in user_args_dict:
if user_args in extra_args:
if extra_args.get(user_args) is not None:
if isinstance(cmd[user_args], dict) and isinstance(
extra_args[user_args], dict):
cmd[user_args].update(extra_args[user_args])
else:
cmd[user_args] = extra_args[user_args]
del extra_args[user_args]
return cmd
def forward(self, audio_in: Union[tuple, str, Any] = None) -> list:
"""Decoding
"""
# log file_path/url or tuple (str, str)
if isinstance(audio_in, str) or \
(isinstance(audio_in, tuple) and all(isinstance(item, str) for item in audio_in)):
logger.info(f'Speaker Verification Processing: {audio_in} ...')
else:
logger.info(
f'Speaker Verification Processing: {str(audio_in)[:100]} ...')
data_cmd, raw_inputs = None, None
if isinstance(audio_in, tuple) or isinstance(audio_in, list):
# generate audio_scp
assert len(audio_in) == 2
if isinstance(audio_in[0], str):
# for scp inputs
if len(audio_in[0].split(',')) == 3 and audio_in[0].split(
',')[0].endswith('.scp'):
if len(audio_in[1].split(',')) == 3 and audio_in[1].split(
',')[0].endswith('.scp'):
data_cmd = [
tuple(audio_in[0].split(',')),
tuple(audio_in[1].split(','))
]
# for single-file inputs
else:
audio_scp_1, audio_scp_2 = generate_sv_scp_from_url(
audio_in)
if isinstance(audio_scp_1, bytes) and isinstance(
audio_scp_2, bytes):
data_cmd = [(audio_scp_1, 'speech', 'bytes'),
(audio_scp_2, 'ref_speech', 'bytes')]
else:
data_cmd = [(audio_scp_1, 'speech', 'sound'),
(audio_scp_2, 'ref_speech', 'sound')]
# for raw bytes inputs
elif isinstance(audio_in[0], bytes):
data_cmd = [(audio_in[0], 'speech', 'bytes'),
(audio_in[1], 'ref_speech', 'bytes')]
else:
raise TypeError('Unsupported data type.')
else:
if isinstance(audio_in, str):
# for scp inputs
if len(audio_in.split(',')) == 3:
data_cmd = [audio_in.split(',')]
# for single-file inputs
else:
audio_scp = generate_scp_for_sv(audio_in)
if isinstance(audio_scp, bytes):
data_cmd = [(audio_scp, 'speech', 'bytes')]
else:
data_cmd = [(audio_scp, 'speech', 'sound')]
# for raw bytes
elif isinstance(audio_in, bytes):
data_cmd = [(audio_in, 'speech', 'bytes')]
# for ndarray and tensor inputs
else:
import torch
import numpy as np
if isinstance(audio_in, torch.Tensor):
raw_inputs = audio_in
elif isinstance(audio_in, np.ndarray):
raw_inputs = audio_in
else:
raise TypeError('Unsupported data type.')
self.cmd['name_and_type'] = data_cmd
self.cmd['raw_inputs'] = raw_inputs
result = self.run_inference(self.cmd)
return result
def run_inference(self, cmd):
if self.framework == Frameworks.torch:
sv_result = self.funasr_infer_modelscope(
data_path_and_name_and_type=cmd['name_and_type'],
raw_inputs=cmd['raw_inputs'],
output_dir_v2=cmd['output_dir'],
param_dict=cmd['param_dict'])
else:
raise ValueError('model type is mismatched')
return sv_result
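
For reference, a minimal usage sketch for the speaker verification pipeline above, assuming its registered task name is Tasks.speaker_verification and that __call__ forwards the audio pair to forward(); the model id is a placeholder, not taken from this diff:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# '<sv_model_id>' stands in for a speaker-verification model hosted on ModelScope.
sv_pipeline = pipeline(task=Tasks.speaker_verification, model='<sv_model_id>')

# Two local paths or URLs: forward() turns them into ('speech', 'ref_speech') entries
# via generate_sv_scp_from_url; pairs of raw bytes are passed through as 'bytes' entries.
result = sv_pipeline(('enroll_utterance.wav', 'test_utterance.wav'))
print(result)

Passing a single input (path, URL, scp triple, bytes, ndarray, or tensor) instead of a pair follows the single-input branches of forward() shown above.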

View File

@@ -1,317 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, List, Sequence, Tuple, Union
import json
import yaml
from funasr.utils import asr_utils
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_scp_from_url,
generate_text_from_url,
update_local_model)
from modelscope.utils.constant import Frameworks, ModelFile, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['TimestampPipeline']
@PIPELINES.register_module(
Tasks.speech_timestamp, module_name=Pipelines.speech_timestamp_inference)
class TimestampPipeline(Pipeline):
"""Timestamp Inference Pipeline
Example:
>>> from modelscope.pipelines import pipeline
>>> from modelscope.utils.constant import Tasks
>>> pipeline_infer = pipeline(
>>> task=Tasks.speech_timestamp,
>>> model='damo/speech_timestamp_predictor-v1-16k-offline')
>>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav'
>>> text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢'
>>> print(pipeline_infer(audio_in, text_in))
"""
def __init__(self,
model: Union[Model, str] = None,
ngpu: int = 1,
**kwargs):
"""
Use `model` and `preprocessor` to create a timestamp prediction pipeline
Args:
model ('Model' or 'str'):
The pipeline handles three types of model:
- A model instance
- A model local dir
- A model id in the model hub
output_dir('str'):
output dir path
batch_size('int'):
the batch size for inference
ngpu('int'):
the number of gpus, 0 indicates CPU mode
split_with_space('bool'):
split the input sentence by space
seg_dict_file('str'):
seg dict file
param_dict('dict'):
extra kwargs
"""
super().__init__(model=model, **kwargs)
config_path = os.path.join(model, ModelFile.CONFIGURATION)
self.cmd = self.get_cmd(config_path, kwargs, model)
from funasr.bin import tp_inference_launch
self.funasr_infer_modelscope = tp_inference_launch.inference_launch(
mode=self.cmd['mode'],
batch_size=self.cmd['batch_size'],
dtype=self.cmd['dtype'],
ngpu=ngpu,
seed=self.cmd['seed'],
num_workers=self.cmd['num_workers'],
log_level=self.cmd['log_level'],
key_file=self.cmd['key_file'],
timestamp_infer_config=self.cmd['timestamp_infer_config'],
timestamp_model_file=self.cmd['timestamp_model_file'],
timestamp_cmvn_file=self.cmd['timestamp_cmvn_file'],
output_dir=self.cmd['output_dir'],
allow_variable_data_keys=self.cmd['allow_variable_data_keys'],
split_with_space=self.cmd['split_with_space'],
seg_dict_file=self.cmd['seg_dict_file'],
param_dict=self.cmd['param_dict'],
**kwargs,
)
def __call__(self,
audio_in: Union[str, bytes],
text_in: str,
audio_fs: int = None,
recog_type: str = None,
audio_format: str = None,
output_dir: str = None,
param_dict: dict = None,
**kwargs) -> Dict[str, Any]:
"""
Decoding the input audios
Args:
audio_in('str' or 'bytes'):
- A string containing a local path to a wav file
- A string containing a local path to a scp
- A string containing a wav url
text_in('str'):
- A text str input
- A local text file input endswith .txt or .scp
audio_fs('int'):
sampling frequency of the input audio
recog_type('str'):
recog type for wav file or datasets file ('wav', 'test', 'dev', 'train')
audio_format('str'):
audio format ('pcm', 'scp', 'kaldi_ark', 'tfrecord')
output_dir('str'):
output dir
param_dict('dict'):
extra kwargs
Return:
A dictionary of result or a list of dictionaries of results.
The dictionary contains the following keys:
- **text** ('str') -- The timestamp prediction result.
"""
self.audio_in = None
self.text_in = None
self.raw_inputs = None
self.recog_type = recog_type
self.audio_format = audio_format
self.audio_fs = None
checking_audio_fs = None
if output_dir is not None:
self.cmd['output_dir'] = output_dir
if param_dict is not None:
self.cmd['param_dict'] = param_dict
# audio
if isinstance(audio_in, str):
# for funasr code, generate wav.scp from url or local path
self.audio_in, self.raw_inputs = generate_scp_from_url(audio_in)
elif isinstance(audio_in, bytes):
self.audio_in = audio_in
self.raw_inputs = None
else:
import numpy
import torch
if isinstance(audio_in, torch.Tensor):
self.audio_in = None
self.raw_inputs = audio_in
elif isinstance(audio_in, numpy.ndarray):
self.audio_in = None
self.raw_inputs = audio_in
# text
if text_in.startswith('http'):
self.text_in, _ = generate_text_from_url(text_in)
else:
self.text_in = text_in
# set the sample_rate of audio_in if checking_audio_fs is valid
if checking_audio_fs is not None:
self.audio_fs = checking_audio_fs
if recog_type is None or audio_format is None:
self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking(
audio_in=self.audio_in,
recog_type=recog_type,
audio_format=audio_format)
if hasattr(asr_utils,
'sample_rate_checking') and self.audio_in is not None:
checking_audio_fs = asr_utils.sample_rate_checking(
self.audio_in, self.audio_format)
if checking_audio_fs is not None:
self.audio_fs = checking_audio_fs
if audio_fs is not None:
self.cmd['fs']['audio_fs'] = audio_fs
else:
self.cmd['fs']['audio_fs'] = self.audio_fs
output = self.forward(self.audio_in, self.text_in, **kwargs)
result = self.postprocess(output)
return result
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""Postprocessing
"""
rst = {}
for i in range(len(inputs)):
if i == 0:
for key, value in inputs[0].items():
if key == 'value':
if len(value) > 0:
rst[OutputKeys.TEXT] = value
elif key != 'key':
rst[key] = value
else:
rst[inputs[i]['key']] = inputs[i]['value']
return rst
def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]:
model_cfg = json.loads(open(config_path).read())
model_dir = os.path.dirname(config_path)
# generate inference command
timestamp_model_file = os.path.join(
model_dir,
model_cfg['model']['model_config']['timestamp_model_file'])
timestamp_infer_config = os.path.join(
model_dir,
model_cfg['model']['model_config']['timestamp_infer_config'])
timestamp_cmvn_file = os.path.join(
model_dir,
model_cfg['model']['model_config']['timestamp_cmvn_file'])
mode = model_cfg['model']['model_config']['mode']
frontend_conf = None
if os.path.exists(timestamp_infer_config):
config_file = open(timestamp_infer_config, encoding='utf-8')
root = yaml.full_load(config_file)
config_file.close()
if 'frontend_conf' in root:
frontend_conf = root['frontend_conf']
seg_dict_file = None
if 'seg_dict_file' in model_cfg['model']['model_config']:
seg_dict_file = os.path.join(
model_dir, model_cfg['model']['model_config']['seg_dict_file'])
update_local_model(model_cfg['model']['model_config'], model_path,
extra_args)
cmd = {
'mode': mode,
'batch_size': 1,
'dtype': 'float32',
'ngpu': 0, # 0: only CPU, ngpu>=1: gpu number if cuda is available
'seed': 0,
'num_workers': 0,
'log_level': 'ERROR',
'key_file': None,
'allow_variable_data_keys': False,
'split_with_space': True,
'seg_dict_file': seg_dict_file,
'timestamp_infer_config': timestamp_infer_config,
'timestamp_model_file': timestamp_model_file,
'timestamp_cmvn_file': timestamp_cmvn_file,
'output_dir': None,
'param_dict': None,
'fs': {
'model_fs': None,
'audio_fs': None
}
}
if frontend_conf is not None and 'fs' in frontend_conf:
cmd['fs']['model_fs'] = frontend_conf['fs']
user_args_dict = [
'output_dir',
'batch_size',
'mode',
'ngpu',
'param_dict',
'num_workers',
'log_level',
'split_with_space',
'seg_dict_file',
]
for user_args in user_args_dict:
if user_args in extra_args:
if extra_args.get(user_args) is not None:
cmd[user_args] = extra_args[user_args]
del extra_args[user_args]
return cmd
def forward(self, audio_in: Dict[str, Any], text_in: Dict[str, Any],
**kwargs) -> Dict[str, Any]:
"""Decoding
"""
logger.info('Timestamp Processing ...')
# generate inputs
data_cmd: Sequence[Tuple[str, str, str]]
if isinstance(self.audio_in, bytes):
data_cmd = [(self.audio_in, 'speech', 'bytes')]
data_cmd.append((text_in, 'text', 'text'))
elif isinstance(self.audio_in, str):
data_cmd = [(self.audio_in, 'speech', 'sound')]
data_cmd.append((text_in, 'text', 'text'))
elif self.raw_inputs is not None:
data_cmd = None
if self.raw_inputs is None and data_cmd is None:
raise ValueError('please check audio_in')
self.cmd['name_and_type'] = data_cmd
self.cmd['raw_inputs'] = self.raw_inputs
self.cmd['audio_in'] = self.audio_in
tp_result = self.run_inference(self.cmd, **kwargs)
return tp_result
def run_inference(self, cmd, **kwargs):
tp_result = []
if self.framework == Frameworks.torch:
tp_result = self.funasr_infer_modelscope(
data_path_and_name_and_type=cmd['name_and_type'],
raw_inputs=cmd['raw_inputs'],
output_dir_v2=cmd['output_dir'],
fs=cmd['fs'],
param_dict=cmd['param_dict'],
**kwargs)
else:
raise ValueError('model type is mismatched')
return tp_result
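
For reference, a minimal invocation mirroring the class docstring example above; per postprocess(), the prediction is returned under OutputKeys.TEXT. This file is deleted by the commit, so the sketch documents the pre-1.0 interface only:

from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import Tasks

tp = pipeline(
    task=Tasks.speech_timestamp,
    model='damo/speech_timestamp_predictor-v1-16k-offline')
rec = tp(
    'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
    text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢')
print(rec[OutputKeys.TEXT])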

View File

@@ -1,255 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, List, Sequence, Tuple, Union
import json
import yaml
from funasr.utils import asr_utils
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.audio.audio_utils import (generate_scp_from_url,
update_local_model)
from modelscope.utils.constant import Frameworks, ModelFile, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['VoiceActivityDetectionPipeline']
@PIPELINES.register_module(
Tasks.voice_activity_detection, module_name=Pipelines.vad_inference)
class VoiceActivityDetectionPipeline(Pipeline):
"""Voice Activity Detection Inference Pipeline
use `model` to create a Voice Activity Detection pipeline.
Args:
model: A model instance, or a model local dir, or a model id in the model hub.
kwargs (dict, `optional`):
Extra kwargs passed into the preprocessor's constructor.
Example:
>>> from modelscope.pipelines import pipeline
>>> pipeline_vad = pipeline(
>>> task=Tasks.voice_activity_detection, model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch')
>>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm'
>>> print(pipeline_vad(audio_in))
"""
def __init__(self,
model: Union[Model, str] = None,
ngpu: int = 1,
**kwargs):
"""use `model` to create an vad pipeline for prediction
"""
super().__init__(model=model, **kwargs)
config_path = os.path.join(model, ModelFile.CONFIGURATION)
self.cmd = self.get_cmd(config_path, kwargs, model)
from funasr.bin import vad_inference_launch
self.funasr_infer_modelscope = vad_inference_launch.inference_launch(
mode=self.cmd['mode'],
batch_size=self.cmd['batch_size'],
dtype=self.cmd['dtype'],
ngpu=ngpu,
seed=self.cmd['seed'],
num_workers=self.cmd['num_workers'],
log_level=self.cmd['log_level'],
key_file=self.cmd['key_file'],
vad_infer_config=self.cmd['vad_infer_config'],
vad_model_file=self.cmd['vad_model_file'],
vad_cmvn_file=self.cmd['vad_cmvn_file'],
**kwargs,
)
def __call__(self,
audio_in: Union[str, bytes],
audio_fs: int = None,
recog_type: str = None,
audio_format: str = None,
output_dir: str = None,
param_dict: dict = None,
**kwargs) -> Dict[str, Any]:
"""
Decoding the input audios
Args:
audio_in('str' or 'bytes'):
- A string containing a local path to a wav file
- A string containing a local path to a scp
- A string containing a wav url
- A bytes input
audio_fs('int'):
sampling frequency of the input audio
recog_type('str'):
recog type for wav file or datasets file ('wav', 'test', 'dev', 'train')
audio_format('str'):
audio format ('pcm', 'scp', 'kaldi_ark', 'tfrecord')
output_dir('str'):
output dir
param_dict('dict'):
extra kwargs
Return:
A dictionary of result or a list of dictionaries of results.
The dictionary contains the following keys:
- **text** ('str') -- The VAD result.
"""
self.audio_in = None
self.raw_inputs = None
self.recog_type = recog_type
self.audio_format = audio_format
self.audio_fs = None
checking_audio_fs = None
if output_dir is not None:
self.cmd['output_dir'] = output_dir
if param_dict is not None:
self.cmd['param_dict'] = param_dict
if isinstance(audio_in, str):
# for funasr code, generate wav.scp from url or local path
self.audio_in, self.raw_inputs = generate_scp_from_url(audio_in)
elif isinstance(audio_in, bytes):
self.audio_in = audio_in
self.raw_inputs = None
else:
import numpy
import torch
if isinstance(audio_in, torch.Tensor):
self.audio_in = None
self.raw_inputs = audio_in
elif isinstance(audio_in, numpy.ndarray):
self.audio_in = None
self.raw_inputs = audio_in
# set the sample_rate of audio_in if checking_audio_fs is valid
if checking_audio_fs is not None:
self.audio_fs = checking_audio_fs
if recog_type is None or audio_format is None:
self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking(
audio_in=self.audio_in,
recog_type=recog_type,
audio_format=audio_format)
if hasattr(asr_utils,
'sample_rate_checking') and self.audio_in is not None:
checking_audio_fs = asr_utils.sample_rate_checking(
self.audio_in, self.audio_format)
if checking_audio_fs is not None:
self.audio_fs = checking_audio_fs
if audio_fs is not None:
self.cmd['fs']['audio_fs'] = audio_fs
else:
self.cmd['fs']['audio_fs'] = self.audio_fs
output = self.forward(self.audio_in, **kwargs)
result = self.postprocess(output)
return result
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""Postprocessing
"""
rst = {}
for i in range(len(inputs)):
if i == 0:
text = inputs[0]['value']
if len(text) > 0:
rst[OutputKeys.TEXT] = text
else:
rst[inputs[i]['key']] = inputs[i]['value']
return rst
def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]:
model_cfg = json.loads(open(config_path).read())
model_dir = os.path.dirname(config_path)
# generate inference command
vad_model_path = os.path.join(
model_dir, model_cfg['model']['model_config']['vad_model_name'])
vad_model_config = os.path.join(
model_dir, model_cfg['model']['model_config']['vad_model_config'])
vad_cmvn_file = os.path.join(
model_dir, model_cfg['model']['model_config']['vad_mvn_file'])
mode = model_cfg['model']['model_config']['mode']
frontend_conf = None
if os.path.exists(vad_model_config):
config_file = open(vad_model_config, encoding='utf-8')
root = yaml.full_load(config_file)
config_file.close()
if 'frontend_conf' in root:
frontend_conf = root['frontend_conf']
update_local_model(model_cfg['model']['model_config'], model_path,
extra_args)
cmd = {
'mode': mode,
'batch_size': 1,
'dtype': 'float32',
'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available
'seed': 0,
'num_workers': 0,
'log_level': 'ERROR',
'key_file': None,
'vad_infer_config': vad_model_config,
'vad_model_file': vad_model_path,
'vad_cmvn_file': vad_cmvn_file,
'output_dir': None,
'param_dict': None,
'fs': {
'model_fs': None,
'audio_fs': None
}
}
if frontend_conf is not None and 'fs' in frontend_conf:
cmd['fs']['model_fs'] = frontend_conf['fs']
user_args_dict = [
'output_dir', 'batch_size', 'mode', 'ngpu', 'param_dict',
'num_workers', 'fs'
]
for user_args in user_args_dict:
if user_args in extra_args:
if extra_args.get(user_args) is not None:
cmd[user_args] = extra_args[user_args]
del extra_args[user_args]
return cmd
def forward(self, audio_in: Dict[str, Any], **kwargs) -> Dict[str, Any]:
"""Decoding
"""
logger.info('VAD Processing ...')
# generate inputs
data_cmd: Sequence[Tuple[str, str, str]]
if isinstance(self.audio_in, bytes):
data_cmd = [self.audio_in, 'speech', 'bytes']
elif isinstance(self.audio_in, str):
data_cmd = [self.audio_in, 'speech', 'sound']
elif self.raw_inputs is not None:
data_cmd = None
self.cmd['name_and_type'] = data_cmd
self.cmd['raw_inputs'] = self.raw_inputs
self.cmd['audio_in'] = self.audio_in
vad_result = self.run_inference(self.cmd, **kwargs)
return vad_result
def run_inference(self, cmd, **kwargs):
vad_result = []
if self.framework == Frameworks.torch:
vad_result = self.funasr_infer_modelscope(
data_path_and_name_and_type=cmd['name_and_type'],
raw_inputs=cmd['raw_inputs'],
output_dir_v2=cmd['output_dir'],
fs=cmd['fs'],
param_dict=cmd['param_dict'],
**kwargs)
else:
raise ValueError('model type is mismatched')
return vad_result
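
A matching usage sketch for the VAD pipeline above, based on its class docstring; the in-memory branch follows the numpy.ndarray handling in __call__, and soundfile is only an illustrative way to load a waveform:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

vad = pipeline(
    task=Tasks.voice_activity_detection,
    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch')

# URL, local path, or scp input, as in the docstring example
print(vad('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm'))

# In-memory waveform input; audio_fs tells the pipeline the sample rate
# import soundfile as sf
# waveform, fs = sf.read('vad_example.wav')
# print(vad(waveform, audio_fs=fs))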

View File

@@ -396,7 +396,6 @@ class Pipeline(ABC):
assert not self.has_multiple_models, 'default implementation does not support multiple models in a pipeline.'
return self.model(inputs, **forward_params)

-@abstractmethod
def postprocess(self, inputs: Dict[str, Any],
**post_params) -> Dict[str, Any]:
""" If current pipeline support model reuse, common postprocess

View File

@@ -51,14 +51,12 @@ class TranslationPipeline(Pipeline):
self._src_vocab_path = osp.join(
model, self.cfg['dataset']['src_vocab']['file'])
-self._src_vocab = dict([
-(w.strip(), i) for i, w in enumerate(open(self._src_vocab_path, encoding='utf-8'))
-])
+self._src_vocab = dict([(w.strip(), i) for i, w in enumerate(
+open(self._src_vocab_path, encoding='utf-8'))])
self._trg_vocab_path = osp.join(
model, self.cfg['dataset']['trg_vocab']['file'])
-self._trg_rvocab = dict([
-(i, w.strip()) for i, w in enumerate(open(self._trg_vocab_path, encoding='utf-8'))
-])
+self._trg_rvocab = dict([(i, w.strip()) for i, w in enumerate(
+open(self._trg_vocab_path, encoding='utf-8'))])
tf_config = tf.ConfigProto(allow_soft_placement=True)
tf_config.gpu_options.allow_growth = True

View File

@@ -251,6 +251,7 @@ class AudioTasks(object):
speech_timestamp = 'speech-timestamp'
speaker_diarization_dialogue_detection = 'speaker-diarization-dialogue-detection'
speaker_diarization_semantic_speaker_turn_detection = 'speaker-diarization-semantic-speaker-turn-detection'
+emotion_recognition = 'emotion-recognition'


class MultiModalTasks(object):

View File

@@ -1 +1 @@
-funasr>=0.6.5
+funasr>=1.0.0
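
The dependency floor moves from the funasr 0.x line to 1.0. A quick sanity check after upgrading, assuming the installed package exposes __version__ (a sketch, not part of the diff):

from packaging.version import parse

import funasr

# Fail early if an old funasr is still installed alongside the new unified pipeline.
assert parse(funasr.__version__) >= parse('1.0.0'), (
    f'funasr>=1.0.0 is required, found {funasr.__version__}')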