Mirror of https://github.com/modelscope/modelscope.git (synced 2026-02-24 12:10:09 +01:00)
add Mglm (#5)
* mglm init
* add mglm requirements

Co-authored-by: Yufeng <zhuyufeng@gmail.com>
Co-authored-by: wenmeng.zwm <wenmeng.zwm@alibaba-inc.com>
modelscope/metainfo.py
@@ -82,6 +82,7 @@ class Models(object):
     bert_for_ds = 'bert-for-document-segmentation'
     ponet = 'ponet'
     T5 = 'T5'
+    mglm = 'mglm'
     bloom = 'bloom'

     # audio models
@@ -251,6 +252,7 @@ class Pipelines(object):
     relation_extraction = 'relation-extraction'
     document_segmentation = 'document-segmentation'
     feature_extraction = 'feature-extraction'
+    mglm_text_summarization = 'mglm-text-summarization'
     translation_en_to_de = 'translation_en_to_de'  # keep it underscore
     translation_en_to_ro = 'translation_en_to_ro'  # keep it underscore
     translation_en_to_fr = 'translation_en_to_fr'  # keep it underscore
@@ -376,6 +378,7 @@ class Preprocessors(object):
     re_tokenizer = 're-tokenizer'
     document_segmentation = 'document-segmentation'
     feature_extraction = 'feature-extraction'
+    mglm_summarization = 'mglm-summarization'
     sentence_piece = 'sentence-piece'

     # audio preprocessor

modelscope/models/nlp/__init__.py
@@ -35,6 +35,7 @@ if TYPE_CHECKING:
         SbertTokenizerFast,
     )
     from .T5 import T5ForConditionalGeneration
+    from .mglm import MGLMForTextSummarization
     from .task_models import (
         FeatureExtractionModel,
         InformationExtractionModel,
@@ -106,6 +107,7 @@ else:
         ],
         'sentence_embedding': ['SentenceEmbedding'],
         'T5': ['T5ForConditionalGeneration'],
+        'mglm': ['MGLMForTextSummarization'],
         'gpt_neo': ['GPTNeoModel'],
         'bloom': ['BloomModel'],
     }
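With these registry entries in place, the new task becomes reachable through ModelScope's standard pipeline() factory. A rough usage sketch follows; the task string and the model id are illustrative assumptions, not part of this commit:

from modelscope.pipelines import pipeline

# Sketch only: 'text-summarization' and the model id below are assumed values.
summarizer = pipeline(
    task='text-summarization',
    pipeline_name='mglm-text-summarization',  # Pipelines.mglm_text_summarization above
    model='ZhipuAI/mGLM-large')  # hypothetical model id
print(summarizer('A long document to be summarized ...'))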
modelscope/models/nlp/mglm/__init__.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# Modified by Zhipu.AI
# Original Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .mglm_for_text_summarization import MGLMForTextSummarization
else:
    _import_structure = {
        'mglm_for_text_summarization': ['MGLMForTextSummarization'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
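The LazyImportModule indirection above keeps importing the package cheap: the summarization module is only executed when one of its symbols is first accessed. A rough illustration, assuming the surrounding modelscope package and its dependencies are installed:

from modelscope.models.nlp import mglm  # fast: mglm_for_text_summarization is not executed yet

cls = mglm.MGLMForTextSummarization  # first attribute access triggers the real import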
modelscope/models/nlp/mglm/arguments.py (new executable file, 793 lines)
@@ -0,0 +1,793 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""argparser configuration"""

import argparse
import os

import deepspeed
import json
import torch

from .utils import get_hostname

def add_model_config_args(parser):
    """Model arguments"""

    group = parser.add_argument_group('model', 'model configuration')

    group.add_argument(
        '--transformer-xl',
        action='store_true',
        help='use transformer-xl for training')
    group.add_argument(
        '--pretrained-bert',
        action='store_true',
        help='use a pretrained bert-large-uncased model instead '
        'of initializing from scratch. See '
        '--tokenizer-model-type to specify which pretrained '
        'BERT model to use')
    group.add_argument(
        '--encoder-decoder',
        action='store_true',
        help='use the encoder-decoder architecture for blocklm')
    group.add_argument(
        '--attention-dropout',
        type=float,
        default=0.1,
        help='dropout probability for attention weights')
    group.add_argument(
        '--num-attention-heads',
        type=int,
        default=16,
        help='num of transformer attention heads')
    group.add_argument(
        '--hidden-size', type=int, default=1024, help='transformer hidden size')
    group.add_argument(
        '--intermediate-size',
        type=int,
        default=None,
        help='transformer embedding dimension for FFN; '
        'set to 4*`--hidden-size` if it is None')
    group.add_argument(
        '--num-layers', type=int, default=24, help='num decoder layers')
    group.add_argument(
        '--layernorm-epsilon',
        type=float,
        default=1e-5,
        help='layer norm epsilon')
    group.add_argument(
        '--hidden-dropout',
        type=float,
        default=0.1,
        help='dropout probability for hidden state transformer')
    group.add_argument(
        '--output-dropout',
        type=float,
        default=0.1,
        help='dropout probability for pooled output')
    group.add_argument(
        '--max-position-embeddings',
        type=int,
        default=512,
        help='maximum number of position embeddings to use')
    group.add_argument(
        '--vocab-size',
        type=int,
        default=250112,
        help='vocab size to use for non-character-level '
        'tokenization. This value will only be used when '
        'creating a tokenizer')
    group.add_argument(
        '--deep-init',
        action='store_true',
        help='initialize bert model similar to gpt2 model. '
        'Scales initialization of projection layers by a '
        'factor of 1/sqrt(2N). Necessary to train bert '
        'models larger than BERT-Large.')
    group.add_argument(
        '--make-vocab-size-divisible-by',
        type=int,
        default=128,
        help='Pad the vocab size to be divisible by this value. '
        'This is added for computational efficiency reasons.')
    group.add_argument(
        '--cpu-optimizer', action='store_true', help='Run optimizer on CPU')
    group.add_argument(
        '--cpu_torch_adam',
        action='store_true',
        help='Use Torch Adam as optimizer on CPU.')

    return parser


def add_fp16_config_args(parser):
    """Mixed precision arguments."""

    group = parser.add_argument_group('fp16', 'fp16 configurations')

    group.add_argument(
        '--fp16', action='store_true', help='Run model in fp16 mode')
    group.add_argument(
        '--fp32-embedding', action='store_true', help='embedding in fp32')
    group.add_argument(
        '--fp32-layernorm', action='store_true', help='layer norm in fp32')
    group.add_argument(
        '--fp32-tokentypes',
        action='store_true',
        help='embedding token types in fp32')
    group.add_argument(
        '--fp32-allreduce', action='store_true', help='all-reduce in fp32')
    group.add_argument(
        '--hysteresis',
        type=int,
        default=2,
        help='hysteresis for dynamic loss scaling')
    group.add_argument(
        '--loss-scale',
        type=float,
        default=None,
        help='Static loss scaling, positive power of 2 '
        'values can improve fp16 convergence. If None, dynamic '
        'loss scaling is used.')
    group.add_argument(
        '--loss-scale-window',
        type=float,
        default=1000,
        help='Window over which to raise/lower dynamic scale')
    group.add_argument(
        '--min-scale',
        type=float,
        default=1,
        help='Minimum loss scale for dynamic loss scale')
    group.add_argument('--attention-scale', type=float, default=1.0)
    return parser

def add_training_args(parser):
    """Training arguments."""

    group = parser.add_argument_group('train', 'training configurations')

    group.add_argument(
        '--experiment-name',
        type=str,
        default='gpt-345M',
        help='The experiment name for summary and checkpoint')
    group.add_argument(
        '--batch-size', type=int, default=4, help='Data Loader batch size')
    group.add_argument(
        '--gradient-accumulation-steps',
        type=int,
        default=1,
        help='number of gradient accumulation steps')
    group.add_argument(
        '--weight-decay',
        type=float,
        default=0.01,
        help='weight decay coefficient for L2 regularization')
    group.add_argument(
        '--checkpoint-activations',
        action='store_true',
        help='checkpoint activation to allow for training '
        'with larger models and sequences')
    group.add_argument(
        '--checkpoint-num-layers',
        type=int,
        default=1,
        help='chunk size (number of layers) for checkpointing')
    group.add_argument(
        '--deepspeed-activation-checkpointing',
        action='store_true',
        help='uses activation checkpointing from deepspeed')
    group.add_argument(
        '--epochs',
        type=int,
        default=None,
        help='Number of finetuning epochs. Zero results in evaluation only.')
    group.add_argument(
        '--clip-grad', type=float, default=1.0, help='gradient clipping')
    group.add_argument(
        '--train-iters',
        type=int,
        default=0,
        help='total number of iterations to train over all training runs')
    group.add_argument('--label-smoothing', type=float, default=0.0)
    group.add_argument(
        '--log-interval', type=int, default=100, help='report interval')
    group.add_argument(
        '--summary-dir',
        type=str,
        default='',
        help='The directory to store the summary')
    group.add_argument('--seed', type=int, default=1234, help='random seed')
    # Batch producer arguments
    group.add_argument(
        '--reset-position-ids',
        action='store_true',
        help='Reset position ids after end-of-document token.')
    group.add_argument(
        '--reset-attention-mask',
        action='store_true',
        help='Reset self attention mask after '
        'end-of-document token.')

    # Learning rate.
    group.add_argument(
        '--lr-decay-iters',
        type=int,
        default=None,
        help='number of iterations to decay LR over. '
        'If None defaults to `--train-iters`*`--epochs`')
    group.add_argument(
        '--lr-decay-style',
        type=str,
        default='linear',
        choices=['constant', 'linear', 'cosine', 'exponential'],
        help='learning rate decay function')
    group.add_argument('--lr-decay-ratio', type=float, default=0.1)
    group.add_argument(
        '--lr', type=float, default=1.0e-4, help='initial learning rate')
    group.add_argument(
        '--warmup',
        type=float,
        default=0.01,
        help='percentage of data to warmup on (.01 = 1% of all '
        'training iters). Default 0.01')
    group.add_argument(
        '--switch-linear',
        action='store_true',
        help='Switch to linear decay for cosine decay')
    # model checkpointing
    group.add_argument(
        '--save',
        type=str,
        default=None,
        help='Output directory to save checkpoints to.')
    group.add_argument('--new-save-directory', action='store_true')
    group.add_argument(
        '--save-epoch',
        type=int,
        default=1,
        help='number of epochs between saves')
    group.add_argument(
        '--save-interval',
        type=int,
        default=5000,
        help='number of iterations between saves')
    group.add_argument(
        '--no-save-optim',
        action='store_true',
        help='Do not save current optimizer.')
    group.add_argument(
        '--no-save-rng',
        action='store_true',
        help='Do not save current rng state.')
    group.add_argument(
        '--load',
        type=str,
        default=None,
        help='Path to a directory containing a model checkpoint.')
    group.add_argument(
        '--no-load-optim',
        action='store_true',
        help='Do not load optimizer when loading checkpoint.')
    group.add_argument(
        '--no-load-rng',
        action='store_true',
        help='Do not load rng state when loading checkpoint.')
    group.add_argument(
        '--no-load-lr-scheduler',
        action='store_true',
        help='Do not load lr scheduler when loading checkpoint.')
    group.add_argument(
        '--no-deepspeed-load',
        action='store_true',
        help='Do not use deepspeed when loading checkpoint')
    group.add_argument(
        '--finetune',
        action='store_true',
        help='Load model for finetuning. Do not load optimizer '
        'or rng state from checkpoint and set iteration to 0. '
        'Assumed when loading a release checkpoint.')
    group.add_argument(
        '--resume-dataloader',
        action='store_true',
        help='Resume the dataloader when resuming training. '
        'Does not apply to tfrecords dataloader, try resuming '
        'with a different seed in this case.')
    # distributed training args
    group.add_argument(
        '--distributed-backend',
        default='nccl',
        help='which backend to use for distributed training. One of [gloo, nccl]',
        choices=['nccl', 'gloo'])
    group.add_argument(
        '--DDP-impl',
        default='torch',
        choices=['local', 'torch', 'none'],
        help='which DistributedDataParallel implementation to use.')

    group.add_argument(
        '--local_rank',
        type=int,
        default=None,
        help='local rank passed from distributed launcher')
    # BlockLM training args
    group.add_argument(
        '--block-lm',
        action='store_true',
        help='whether to use the BlockLM pre-training')
    group.add_argument(
        '--masked-lm',
        action='store_true',
        help='whether to use the mlm objective')
    group.add_argument('--bert-prob', type=float, default=0.5)
    group.add_argument('--gpt-infill-prob', type=float, default=0.5)
    group.add_argument('--gpt-min-ratio', type=float, default=0.5)
    group.add_argument('--gap-sentence-prob', type=float, default=0.0)
    group.add_argument('--gap-sentence-ratio', type=float, default=0.15)
    group.add_argument('--avg-block-length', type=int, default=3)
    group.add_argument('--short-seq-prob', type=float, default=0.0)
    group.add_argument('--single-span-prob', type=float, default=0.0)
    group.add_argument(
        '--task-mask',
        action='store_true',
        help='Use different mask for generation and blank filling')
    group.add_argument(
        '--no-shuffle-block',
        action='store_true',
        help='do not shuffle the blocks when filling the blank')
    group.add_argument(
        '--no-block-position',
        action='store_true',
        help='Use (rough) absolute positions instead of block positions')
    group.add_argument(
        '--sentinel-token',
        action='store_true',
        help='Use sentinel (mask) tokens to replace 2d position encoding')
    group.add_argument('--block-mask-prob', type=float, default=0.0)
    group.add_argument('--context-mask-ratio', type=float, default=0.0)
    group.add_argument(
        '--random-position',
        action='store_true',
        help='Use random start position to cover all the position embeddings')
    return parser

def add_evaluation_args(parser):
    """Evaluation arguments."""

    group = parser.add_argument_group('validation',
                                      'validation configurations')

    group.add_argument(
        '--eval-batch-size',
        type=int,
        default=None,
        help='Data Loader batch size for evaluation datasets. '
        'Defaults to `--batch-size`')
    group.add_argument(
        '--eval-iters',
        type=int,
        default=100,
        help='number of iterations to run for evaluation on '
        'validation/test sets')
    group.add_argument(
        '--eval-interval',
        type=int,
        default=1000,
        help='interval between running evaluation on validation set')
    group.add_argument(
        '--eval-epoch',
        type=int,
        default=1,
        help='epochs between running evaluation on validation set')
    group.add_argument(
        '--eval-seq-length',
        type=int,
        default=None,
        help='Maximum sequence length to process for '
        'evaluation. Defaults to `--seq-length`')
    group.add_argument(
        '--eval-max-preds-per-seq',
        type=int,
        default=None,
        help='Maximum number of predictions to use for '
        'evaluation. Defaults to '
        'math.ceil(`--eval-seq-length`*.15/10)*10')
    group.add_argument('--overlapping-eval', type=int, default=32)

    return parser


def add_text_generate_args(parser):
    """Text generation arguments."""

    group = parser.add_argument_group('Text generation', 'configurations')
    group.add_argument('--temperature', type=float, default=1.0)
    group.add_argument('--top_p', type=float, default=0.0)
    group.add_argument('--top_k', type=int, default=0)
    group.add_argument('--out-seq-length', type=int, default=256)
    group.add_argument('--num-beams', type=int, default=1)
    group.add_argument('--length-penalty', type=float, default=0.0)
    group.add_argument('--no-repeat-ngram-size', type=int, default=0)
    group.add_argument('--min-tgt-length', type=int, default=0)
    group.add_argument('--select-topk', action='store_true')
    group.add_argument('--blank-maskratio', type=float, default=0.1)
    return parser

def add_data_args(parser):
    """Train/valid/test data arguments."""

    group = parser.add_argument_group('data', 'data configurations')

    group.add_argument(
        '--model-parallel-size',
        type=int,
        default=1,
        help='size of the model parallel.')
    group.add_argument(
        '--shuffle',
        action='store_true',
        help='Shuffle data. Shuffling is deterministic '
        'based on seed and current epoch.')
    group.add_argument('--filter-english', action='store_true')
    group.add_argument(
        '--train-data',
        nargs='+',
        default=None,
        help='Whitespace separated filenames or corpora names '
        'for training.')
    group.add_argument(
        '--valid-data',
        nargs='*',
        default=None,
        help="""Filename for validation data.""")
    group.add_argument(
        '--test-data',
        nargs='*',
        default=None,
        help="""Filename for testing""")
    group.add_argument(
        '--data-dir',
        type=str,
        default=None,
        help='The data path to all the data files')
    group.add_argument(
        '--input-data-sizes-file',
        type=str,
        default='sizes.txt',
        help='the filename containing all the shards sizes')

    group.add_argument(
        '--delim', default=',', help='delimiter used to parse csv data files')
    group.add_argument(
        '--text-key',
        default='sentence',
        help='key to use to extract text from json/csv')
    group.add_argument(
        '--eval-text-key',
        default=None,
        help='key to use to extract text from '
        'json/csv evaluation datasets')
    group.add_argument(
        '--split',
        default='1000,1,1',
        help='comma-separated list of proportions for training,'
        ' validation, and test split')

    group.add_argument(
        '--no-lazy-loader',
        action='store_true',
        help='whether to lazy read the data set')
    group.add_argument('--half-lazy-loader', action='store_true')
    group.add_argument(
        '--loader-scatter',
        type=int,
        default=None,
        help='Number of scatters to use for dataloaders')
    group.add_argument(
        '--loose-json',
        action='store_true',
        help='Use loose json (one json-formatted string per '
        'newline), instead of tight json (data file is one '
        'json string)')
    group.add_argument(
        '--presplit-sentences',
        action='store_true',
        help='Dataset content consists of documents where '
        'each document consists of newline separated sentences')
    group.add_argument(
        '--num-workers',
        type=int,
        default=2,
        help="""Number of workers to use for dataloading""")
    group.add_argument(
        '--tokenizer-model-type',
        type=str,
        default=None,
        help="Model type to use for sentencepiece tokenization \
             (one of ['bpe', 'char', 'unigram', 'word']) or \
             bert vocab to use for BertWordPieceTokenizer (one of \
             ['bert-large-uncased', 'bert-large-cased', etc.])")
    group.add_argument(
        '--tokenizer-path',
        type=str,
        default='tokenizer.model',
        help='path used to save/load sentencepiece tokenization '
        'models')
    group.add_argument(
        '--tokenizer-type',
        type=str,
        default='BertWordPieceTokenizer',
        choices=[
            'CharacterLevelTokenizer', 'SentencePieceTokenizer',
            'BertWordPieceTokenizer', 'GPT2BPETokenizer', 'ChineseSPTokenizer'
        ],
        help='what type of tokenizer to use')
    group.add_argument('--no-pre-tokenize', action='store_true')
    group.add_argument(
        '--cache-dir',
        default=None,
        type=str,
        help='Where to store pre-trained BERT downloads')
    group.add_argument(
        '--use-tfrecords',
        action='store_true',
        help='load `--train-data`, `--valid-data`, '
        '`--test-data` from BERT tf records instead of '
        'normal data pipeline')
    group.add_argument(
        '--seq-length',
        type=int,
        default=512,
        help='Maximum sequence length to process')
    group.add_argument(
        '--mem-length',
        type=int,
        default=0,
        help='The memory length to preserve')
    group.add_argument(
        '--max-preds-per-seq',
        type=int,
        default=None,
        help='Maximum number of predictions to use per sequence. '
        'Defaults to math.ceil(`--seq-length`*.15/10)*10. '
        'MUST BE SPECIFIED IF `--use-tfrecords` is True.')
    group.add_argument('--non-sentence-start', type=float, default=0.0)
    group.add_argument(
        '--sample-one-document',
        action='store_true',
        help='only sample one document in one sample')
    group.add_argument(
        '--load-splits',
        type=str,
        default=None,
        help='The path to load split indices from')
    group.add_argument(
        '--save-splits',
        type=str,
        default=None,
        help='The path to save split indices to')
    group.add_argument(
        '--save-test-data',
        type=str,
        default=None,
        help='The path to save the test data')
    group.add_argument(
        '--multi-task-data',
        nargs='*',
        default=None,
        help='Downstream task names for multi-task pre-training')
    group.add_argument(
        '--multi-task-ratio',
        type=float,
        default=0.0,
        help='Ratio for multi-task pre-training')
    group.add_argument('--multi-seq-length', type=int, default=None)
    group.add_argument('--multi-batch-size', type=int, default=None)
    return parser

def add_finetune_config_args(parser):
    group = parser.add_argument_group('finetune', 'finetune configurations')
    group.add_argument('--task', type=str, help='Task name.')
    group.add_argument(
        '--load-pretrained',
        type=str,
        help='Load pretrained model',
        default=None)
    group.add_argument(
        '--pool-token',
        type=str,
        choices=['start', 'pad', 'cls'],
        help='The token to pool the sequence representation',
        default='cls')
    group.add_argument(
        '--cloze-eval',
        action='store_true',
        help='Evaluation dataset with cloze task')
    group.add_argument(
        '--multi-token',
        action='store_true',
        help='Use multi token for cloze evaluation')
    group.add_argument(
        '--segment-length',
        type=int,
        default=0,
        help='The maximum segment length for cloze evaluation')
    group.add_argument(
        '--loss-func',
        type=str,
        choices=['cross_entropy', 'hinge', 'generative', 'mix'],
        default='cross_entropy')
    group.add_argument('--block-lm-ratio', type=float, default=0.0)
    group.add_argument(
        '--adapet',
        action='store_true',
        help='Use the decoupled cross entropy loss in AdaPET')
    group.add_argument('--pattern-id', type=int, default=0)
    group.add_argument(
        '--fast-decode',
        action='store_true',
        help='Fast decode for multi-token cloze. Can only be used without checkpoint activation.'
    )
    group.add_argument('--few-superglue', action='store_true')
    group.add_argument(
        '--eval-valid',
        action='store_true',
        help='Whether to evaluate on the valid set')
    group.add_argument('--validation-metric', type=str, default=None)
    group.add_argument(
        '--unidirectional',
        action='store_true',
        help='Use the left to right language model')
    group.add_argument('--src-seq-length', type=int, default=None)
    group.add_argument('--tgt-seq-length', type=int, default=None)
    group.add_argument('--adam-beta1', type=float, default=0.9)
    group.add_argument('--adam-beta2', type=float, default=0.999)
    group.add_argument('--adam-eps', type=float, default=1e-8)
    group.add_argument(
        '--optimizer', type=str, choices=['adam', 'adafactor'], default='adam')
    group.add_argument('--wsc-negative', action='store_true')
    group.add_argument('--overwrite', action='store_true')
    group.add_argument('--no-validation', action='store_true')
    # Continuous prompt arguments
    group.add_argument(
        '--continuous-prompt',
        action='store_true',
        help='Use continuous prompt for PET')
    group.add_argument('--num-prompt-tokens', type=int, default=0)
    group.add_argument(
        '--prompt-func', default='lstm', choices=['lstm', 'mlp', 'none'])
    group.add_argument(
        '--freeze-transformer', action='store_true', default=False)
    group.add_argument('--tune-prefix-layers', type=int, default=None)
    group.add_argument('--prefix-prompt', type=int, default=0)
    group.add_argument('--prompt-init', action='store_true', default=False)
    return parser

def get_args():
    """Parse all the args."""

    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
    parser = add_model_config_args(parser)
    parser = add_fp16_config_args(parser)
    parser = add_training_args(parser)
    parser = add_evaluation_args(parser)
    parser = add_text_generate_args(parser)
    parser = add_data_args(parser)
    parser = add_finetune_config_args(parser)

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args(args=[])
    if not args.train_data and not args.data_dir:
        print('WARNING: No training data specified')

    args.cuda = torch.cuda.is_available()

    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
    if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
        mpi_define_env(args)
    elif os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
        # We are using (OpenMPI) mpirun for launching distributed data parallel processes
        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))

        # Possibly running with Slurm
        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
        nodeid = int(os.getenv('SLURM_NODEID', '0'))

        args.local_rank = local_rank
        args.rank = nodeid * local_size + local_rank
        args.world_size = num_nodes * local_size

    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))

    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True
        if args.rank == 0:
            print(' > using dynamic loss scaling')

    # The args fp32_* or fp16_* are meant to be active when the
    # arg fp16 is set. So the default behaviour should all
    # be false.
    if not args.fp16:
        args.fp32_embedding = False
        args.fp32_tokentypes = False
        args.fp32_layernorm = False

    if hasattr(args,
               'deepspeed') and args.deepspeed and args.deepspeed_config is not None:
        with open(args.deepspeed_config) as file:
            deepspeed_config = json.load(file)
        if 'train_micro_batch_size_per_gpu' in deepspeed_config:
            args.batch_size = deepspeed_config['train_micro_batch_size_per_gpu']
        if 'gradient_accumulation_steps' in deepspeed_config:
            args.gradient_accumulation_steps = deepspeed_config[
                'gradient_accumulation_steps']
        else:
            args.gradient_accumulation_steps = 1
        if 'optimizer' in deepspeed_config:
            optimizer_params_config = deepspeed_config['optimizer'].get(
                'params', {})
            args.lr = optimizer_params_config.get('lr', args.lr)
            args.weight_decay = optimizer_params_config.get(
                'weight_decay', args.weight_decay)
    return args


def mpi_define_env(args):
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    master_addr = None
    if rank == 0:
        master_addr = get_hostname()
    master_addr = comm.bcast(master_addr, root=0)

    # Determine local rank by assuming hostnames are unique
    proc_name = MPI.Get_processor_name()
    all_procs = comm.allgather(proc_name)
    local_rank = sum([i == proc_name for i in all_procs[:rank]])

    os.environ['RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    args.local_rank = local_rank
    args.world_size = world_size
    args.rank = rank
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = '29500'  # TORCH_DISTRIBUTED_DEFAULT_PORT = 29500

    print(
        'Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}'
        .format(os.environ['RANK'], args.local_rank, os.environ['WORLD_SIZE'],
                os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']))
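Note that get_args() above calls parser.parse_args(args=[]), so command-line flags are ignored and the returned namespace simply carries the declared defaults (plus any DeepSpeed-config overrides). Callers that need different settings would mutate the namespace directly; a rough usage sketch, assuming deepspeed and torch are installed:

from modelscope.models.nlp.mglm.arguments import get_args

args = get_args()          # defaults only; sys.argv is not consulted
args.fp16 = True           # enable mixed precision
args.seq_length = 1024     # override the default of 512
args.batch_size = 8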
modelscope/models/nlp/mglm/blocklm_utils.py (new file, 625 lines)
@@ -0,0 +1,625 @@
# Copyright (c) 2022 Zhipu.AI

import copy
import math
import random

import numpy as np
import torch
import torch.utils.data
from scipy.stats import poisson

from . import mpu
from .utils import print_rank_0


def rindex(lst, val, start=None):
    if start is None:
        start = len(lst) - 1
    for i in range(start, -1, -1):
        if lst[i] == val:
            return i
    return -1


def index_in_list(lst, val, start=None):
    if start is None:
        start = 0
    for i in range(start, len(lst)):
        if lst[i] == val:
            return i
    return -1

class ConstructBlockStrategy:

    def __init__(self,
                 args,
                 tokenizer,
                 max_seq_length,
                 bert_prob=1.0,
                 gap_sentence_prob=0.0,
                 gpt_infill_prob=0.5,
                 gpt_min_ratio=0.5,
                 bert_ratio=0.15,
                 gap_sentence_ratio=0.15,
                 average_block_length=3,
                 max_block_length=40,
                 block_mask_prob=0.0,
                 context_mask_ratio=0.0,
                 context_mask_range=3,
                 short_seq_prob=0.0,
                 single_span_prob=0.0,
                 block_position_encoding=True,
                 encoder_decoder=False,
                 shuffle_blocks=True,
                 sentinel_token=False,
                 task_mask=False,
                 random_position=False,
                 masked_lm=False):
        self.eod_token = args.eod_token
        self.tokenizer = tokenizer
        self.count = 0
        self.max_seq_length = max_seq_length
        self.rank = mpu.get_data_parallel_rank()
        self.world_size = mpu.get_data_parallel_world_size()
        # self.rank = 0
        # self.world_size = 1
        assert 0.0 <= bert_prob <= 1.0
        self.bert_prob = bert_prob
        self.gap_sentence_prob = gap_sentence_prob
        self.gpt_prob = 1 - bert_prob - gap_sentence_prob
        assert self.gpt_prob >= -1e-10
        self.infill_prob = gpt_infill_prob
        self.gpt_min_ratio = gpt_min_ratio
        self.bert_ratio = bert_ratio
        self.gap_sentence_ratio = gap_sentence_ratio
        self.block_length_distribution = [
            poisson.pmf(i, average_block_length)
            for i in range(1, max_block_length)
        ]
        self.block_mask_prob = block_mask_prob
        self.context_mask_ratio = context_mask_ratio
        self.context_mask_range = context_mask_range
        self.short_seq_prob = short_seq_prob
        self.single_span_prob = single_span_prob
        self.block_position_encoding = block_position_encoding
        self.encoder_decoder = encoder_decoder
        self.shuffle_blocks = shuffle_blocks
        self.sentinel_token = sentinel_token
        self.generation_mask = 'gMASK' if task_mask else 'MASK'
        self.generation_mask = self.tokenizer.get_command(
            self.generation_mask).Id
        self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
        self.gap_sentence_mask = self.tokenizer.get_command(
            self.gap_sentence_mask).Id
        self.random_position = random_position
        self.masked_lm = masked_lm
        print_rank_0(
            f'BERT prob {self.bert_prob}, gap sent prob {self.gap_sentence_prob}, GPT prob {self.gpt_prob}, infill prob {self.infill_prob}'  # noqa
        )
        print_rank_0(
            f'generation min ratio {self.gpt_min_ratio}, block ratio {self.bert_ratio}, gap sent ratio {self.gap_sentence_ratio}'  # noqa
        )
        print_rank_0(
            f'block length distribution {self.block_length_distribution}')
        print_rank_0(
            f'block mask prob {self.block_mask_prob}, context mask ratio {self.context_mask_ratio}'
        )

    def contains_sentence_end(self, tok):
        tok = self.tokenizer.IdToToken(tok)
        if '.' in tok:
            return True
        if '?' in tok:
            return True
        if '!' in tok:
            return True
        if ';' in tok:
            return True
        if ':' in tok:
            return True
        if '。' in tok:
            return True
        if '？' in tok:
            return True
        if '！' in tok:
            return True
        if '；' in tok:
            return True
        if '…' in tok:
            return True
        if '\n' in tok:
            return True
        return False

    @staticmethod
    def sample_spans(span_lengths, total_length, rng, offset=0):
        blank_length = total_length - sum(span_lengths)
        m = blank_length - len(span_lengths) + 1
        places = [rng.randrange(m + 1) for _ in range(len(span_lengths))]
        places.sort()
        spans = []
        for place, span_length in zip(places, span_lengths):
            start = offset + place
            end = offset + place + span_length
            spans.append((start, end))
            offset += span_length + 1
        return spans

    def sample_span_in_document(self, tokens, masked_lengths, rng):
        rng.shuffle(masked_lengths)
        mask_spans = []
        mask_index = 0
        indices = [-1] + np.where(tokens == self.eod_token)[0].tolist()
        last_index = len(tokens)
        documents = []
        for index in reversed(indices):
            start_index = index
            if start_index + 1 < len(tokens) and tokens[
                    start_index + 1] == self.tokenizer.get_command('ENC').Id:
                start_index += 1
            length = last_index - start_index - 1
            if last_index == len(tokens) and length > 0:
                length -= 1
            documents.append((start_index + 1, length))
            last_index = index
        documents.sort(key=lambda x: x[1])
        for i, (offset, length) in enumerate(documents):
            if i == len(documents) - 1:
                current_masked_length, current_count = 0, 0
                while mask_index + current_count < len(
                        masked_lengths
                ) and masked_lengths[
                        mask_index +  # noqa
                        current_count] + current_masked_length + current_count <= length:
                    current_masked_length += masked_lengths[mask_index
                                                            + current_count]
                    current_count += 1
                if current_count > 0:
                    spans = self.sample_spans(
                        masked_lengths[mask_index:mask_index + current_count],
                        length,
                        rng,
                        offset=offset)
                    mask_spans += spans
                if mask_index + current_count < len(masked_lengths) - 1:
                    print(length, masked_lengths[mask_index:],
                          masked_lengths[:mask_index], indices)
            else:
                current_masked_total = int(length * self.bert_ratio)
                current_masked_length, current_count = 0, 0
                while mask_index + current_count < len(
                        masked_lengths
                ) and masked_lengths[
                        mask_index +  # noqa
                        current_count] + current_masked_length <= current_masked_total:
                    current_masked_length += masked_lengths[mask_index
                                                            + current_count]
                    current_count += 1
                if current_count > 0:
                    spans = self.sample_spans(
                        masked_lengths[mask_index:mask_index + current_count],
                        length,
                        rng,
                        offset=offset)
                    mask_spans += spans
                mask_index += current_count
        return mask_spans

    def make_masked_data(self,
                         tokens,
                         loss_masks,
                         attention_mask,
                         block_spans,
                         rng,
                         task='bert'):
        position_ids = np.arange(len(tokens), dtype=np.long)
        targets = copy.deepcopy(tokens)
        mask_id = self.tokenizer.get_command('MASK').Id
        mlm_masks = np.zeros(len(tokens), dtype=np.long)
        for start, end in block_spans:
            for idx in range(start, end):
                tokens[idx] = mask_id
            mlm_masks[start:end] = 1
        loss_masks = loss_masks * mlm_masks
        return tokens, targets, loss_masks, position_ids

    def make_block_data(self,
                        tokens,
                        loss_masks,
                        attention_mask,
                        block_spans,
                        rng,
                        task='bert'):
        text_length = len(tokens)
        position_ids = np.ones(len(tokens), dtype=np.long)
        for start, end in block_spans:
            position_ids[start + 1:end] = 0
        position_ids = np.cumsum(position_ids) - 1
        if self.random_position and position_ids[-1] < self.max_seq_length - 1:
            position_bias = self.max_seq_length - position_ids[-1]
            position_bias = rng.randrange(0, position_bias)
            position_ids = position_ids + position_bias
        if self.encoder_decoder or not self.shuffle_blocks:
            block_spans.sort(key=lambda x: x[0])
        else:
            rng.shuffle(block_spans)
        if self.sentinel_token:
            block_spans = [(start, end, idx)
                           for idx, (start, end) in enumerate(block_spans)]
        else:
            block_spans = [(start, end, 0) for start, end in block_spans]
        target_tokens, target_position_ids, target_block_position_ids, targets = [], [], [], []
        for start, end, idx in block_spans:
            sop_token = 'sop' if idx == 0 else f'sop{idx}'
            target_tokens.append([self.tokenizer.get_command(sop_token).Id])
            span_tokens = copy.deepcopy(tokens[start:end])
            if self.block_mask_prob > 0.0 and task == 'bert':
                for sub_idx in range(len(span_tokens)):
                    if random.random() < self.block_mask_prob:
                        span_tokens[sub_idx] = self.tokenizer.get_command(
                            'dBLOCK').Id
            target_tokens.append(span_tokens)
            targets.append(tokens[start:end])
            targets.append([self.tokenizer.get_command('eop').Id])
            if not self.sentinel_token:
                target_position_id = position_ids[start:end]
                target_position_ids.append(target_position_id)
                target_position_ids.append([target_position_id[0]])
            else:
                target_position_ids.append([self.max_seq_length] *  # noqa
                                           (end - start + 1))
            if self.block_position_encoding:
                target_block_position_ids.append(
                    np.arange(1, end - start + 2, dtype=np.long))
            else:
                target_block_position_ids.append([1] * (end - start + 1))
        block_spans.sort(key=lambda x: x[0])
        source_tokens, source_position_ids, local_spans = [], [], []
        last, current_length = 0, 0
        for start, end, idx in block_spans:
            if task == 'generation':
                mask_id = self.generation_mask
            elif task == 'gap_sentence':
                mask_id = self.gap_sentence_mask
            else:
                mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
                mask_id = self.tokenizer.get_command(mask_token).Id
            local_spans.append((current_length, current_length + start - last))
            source_tokens.append(tokens[last:start])
            source_tokens.append([mask_id])
            source_position_ids.append(position_ids[last:start])
            source_position_ids.append([position_ids[start]])
            current_length += start - last + 1
            last = end
        if last < len(tokens):
            local_spans.append(
                (current_length, current_length + len(tokens) - last))
            source_tokens.append(tokens[last:])
            source_position_ids.append(position_ids[last:])
        source_length = sum(map(len, source_tokens))
        if attention_mask is not None:
            assert source_length == attention_mask
        if target_tokens and self.eod_token in np.concatenate(
                target_tokens).tolist():
            print('Found EOS in target', self.tokenizer.DecodeIds(tokens))
            raise RuntimeError
        if self.encoder_decoder:
            target_tokens = target_tokens + [
                self.tokenizer.get_command('eop').Id
            ]
            loss_masks = np.ones(len(target_tokens), dtype=np.long)
            return source_tokens, target_tokens, loss_masks
        else:
            tokens = np.concatenate(source_tokens + target_tokens)
            if task == 'bert' and self.context_mask_ratio > 0:
                mask_candidates = set()
                for start, end in local_spans:
                    if start != 0:
                        local_end = min(end, start + self.context_mask_range)
                        mask_candidates.update(range(start, local_end))
                    if end != 0:
                        local_start = max(start, end - self.context_mask_range)
                        mask_candidates.update(range(local_start, end))
                mask_pos = rng.sample(
                    mask_candidates,
                    int(self.context_mask_ratio * text_length))
                for pos in mask_pos:
                    tokens[pos] = self.tokenizer.get_command('dBLOCK').Id
            targets = np.concatenate(source_tokens + targets)
            loss_masks = np.ones(len(tokens), dtype=np.long)
            loss_masks[:source_length] = 0
            position_ids = np.concatenate(source_position_ids
                                          + target_position_ids)
            block_position_ids = np.concatenate(
                [np.zeros(source_length, dtype=np.long)]
                + target_block_position_ids)
            position_ids = np.stack([position_ids, block_position_ids], axis=0)
            if attention_mask is not None:
                return tokens, targets, loss_masks, position_ids
            else:
                return tokens, targets, loss_masks, position_ids, source_length

    def generate_blank_data(self,
                            sample,
                            masked_lengths,
                            attention_mask,
                            rng,
                            task='bert'):
        rng.shuffle(masked_lengths)
        tokens, loss_masks = sample['text'], sample['loss_mask']
        assert tokens[0] == self.tokenizer.get_command('ENC').Id
        block_spans = self.sample_span_in_document(tokens, masked_lengths, rng)
        if len(block_spans) < len(masked_lengths):
            return None
        if self.masked_lm:
            data = self.make_masked_data(tokens, loss_masks, attention_mask,
                                         block_spans, rng)
        else:
            data = self.make_block_data(
                tokens,
                loss_masks,
                attention_mask,
                block_spans,
                rng,
                task=task)
        return data

    def split_samples(self, samples, rng):
        target_length = rng.randrange(32, self.max_seq_length - 1)
        num_splits = (self.max_seq_length - 1) // target_length
        new_samples = []
        cls_id = self.tokenizer.get_command('ENC').Id
        eos_id = self.tokenizer.get_command('eos').Id
        for sample in samples:
            tokens, loss_masks = sample['text'][1:], sample['loss_mask'][1:]
            for _ in range(num_splits):
                if target_length >= len(tokens):
                    new_tokens, new_loss_masks = tokens, loss_masks
                else:
                    random_start = rng.randrange(0,
                                                 len(tokens) - target_length)
                    while random_start > 0 and (
                            tokens[random_start] == eos_id or  # noqa
                            not (self.contains_sentence_end(  # noqa
                                tokens[random_start - 1]) or  # noqa
                                 tokens[random_start - 1] == eos_id)):  # noqa
                        random_start -= 1
                    random_end = random_start + target_length
                    while random_end > random_start and not (
                            self.contains_sentence_end(tokens[random_end - 1])
                            or tokens[random_end - 1] == eos_id):
                        random_end -= 1
                    if random_end - random_start < target_length // 2:
                        random_end = random_start + target_length
                    new_tokens, new_loss_masks = tokens[
                        random_start:random_end], loss_masks[
                            random_start:random_end]
                new_tokens = np.concatenate(([cls_id], new_tokens))
                new_loss_masks = np.concatenate(([0], new_loss_masks))
                new_samples.append({
                    'text': new_tokens,
                    'loss_mask': new_loss_masks
                })
        return new_samples

    def construct_blocks(self, samples):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None:
            worker_id, num_workers = worker_info.id, worker_info.num_workers
        else:
            worker_id, num_workers = 0, 1
        rng = random.Random((self.count * num_workers + worker_id)
                            * self.world_size + self.rank)
        self.count += 1
        token_batch, target_batch, loss_mask_batch, position_id_batch = [], [], [], []
        source_batch, target_batch = [], []
        if rng.random() < self.short_seq_prob:
            samples = self.split_samples(samples, rng)
        rand = rng.random()
        single_span = rand < self.single_span_prob
        rand = 0.0 if single_span else rng.random()
        attention_mask = []
        if rand < self.bert_prob:
            mode = 'bert'
            for sample in samples:
                if single_span:
                    masked_lengths = [
                        rng.choices(
                            range(1,
                                  len(self.block_length_distribution) + 1),
                            weights=self.block_length_distribution)[0]
                    ]
                    masked_count = masked_lengths[0]
                else:
                    masked_lengths, masked_count = [], 0
                    while masked_count < int(
                            self.bert_ratio * len(sample['text'])):
                        block_length = rng.choices(
                            range(1,
                                  len(self.block_length_distribution) + 1),
                            weights=self.block_length_distribution)[0]
                        masked_lengths.append(block_length)
                        masked_count += block_length
                if self.masked_lm:
                    sep = len(sample['text'])
                else:
                    sep = len(
                        sample['text']) - masked_count + len(masked_lengths)
                data = self.generate_blank_data(
                    sample, masked_lengths, sep, rng, task='bert')
                if data is not None:
                    if self.encoder_decoder:
                        source_tokens, target_tokens, loss_masks = data
                        source_batch.append(source_tokens)
                        target_batch.append(target_tokens)
                        loss_mask_batch.append(loss_masks)
                    else:
                        tokens, targets, loss_masks, position_ids = data
                        token_batch.append(tokens)
                        target_batch.append(targets)
                        loss_mask_batch.append(loss_masks)
                        position_id_batch.append(position_ids)
                        attention_mask.append(sep)

        elif rand < self.bert_prob + self.gap_sentence_prob:
            mode = 'sentence'
            for sample in samples:
                tokens, loss_masks = sample['text'], sample['loss_mask']
                sentence_spans = []
                last_index = 1 if tokens[0] == self.tokenizer.get_command(
                    'ENC').Id else 0
                for i in range(len(tokens)):
                    if self.contains_sentence_end(tokens[i]):
                        if last_index < i + 1:
                            sentence_spans.append((last_index, i + 1))
                        last_index = i + 1
                    elif tokens[i] == self.tokenizer.get_command('eos').Id:
                        last_index = i + 1
                if last_index < len(tokens):
                    sentence_spans.append((last_index, len(tokens)))
                if not sentence_spans and torch.distributed.get_rank() == 0:
                    try:
                        print(self.tokenizer.DecodeIds(tokens[1:]))
                    except IndexError:
                        print(tokens[1:])
                rng.shuffle(sentence_spans)
                block_spans, block_length = [], 0
                for start, end in sentence_spans:
                    block_spans.append((start, end))
                    block_length += end - start
                    if block_length >= int(
                            self.gap_sentence_ratio * len(tokens)):
                        break
                data = self.make_block_data(
                    tokens,
                    loss_masks,
                    None,
                    block_spans,
                    rng,
                    task='gap_sentence')
                tokens, targets, loss_masks, position_ids, sep = data
                token_batch.append(tokens)
                target_batch.append(targets)
                loss_mask_batch.append(loss_masks)
                position_id_batch.append(position_ids)
                attention_mask.append(sep)
        else:
            # start_indices = [index_in_list(sample['loss_mask'], 1) for sample in samples]
            # end_indices = [rindex(sample['loss_mask'], 1) for sample in samples]
            # start_index, end_index = max(start_indices), min(end_indices) - self.min_generation_length
            # if end_index < start_index + 1:
            #     end_index = start_index + 1
            # division = rng.randrange(start_index, end_index)
            mode = 'gpt'
            max_generation_length = rng.randint(
                int(self.gpt_min_ratio
                    * min(map(lambda x: len(x['text']), samples))),
                max(map(lambda x: len(x['text']), samples)) - 2)
            for sample in samples:
                generation_length = min(max_generation_length,
                                        len(sample['text']) - 2)
                attention_mask.append(
                    len(sample['text']) - generation_length + 1)
                multiple_doc = index_in_list(
                    sample['text'],
                    self.tokenizer.get_command('eos').Id) not in [
                        -1, len(sample['text']) - 1
                    ]  # noqa
                if multiple_doc or rng.random() < self.infill_prob:
                    division = len(sample['text']) - generation_length
                    tokens, loss_masks = sample['text'], sample['loss_mask']
                    source_tokens, target_tokens = tokens[:division], tokens[
                        division:]
                    target_masks = loss_masks[division:]
                    tokens = np.concatenate((source_tokens, [
                        self.generation_mask,
                        self.tokenizer.get_command('sop').Id
                    ], target_tokens[:-1]))
                    targets = np.concatenate(
                        (source_tokens, [self.generation_mask], target_tokens))
                    loss_masks = np.concatenate(
                        (np.zeros(len(source_tokens) + 1,
                                  dtype=np.long), target_masks))
                    token_batch.append(tokens)
                    target_batch.append(targets)
                    loss_mask_batch.append(loss_masks)
                    position_ids = np.arange(
                        len(source_tokens) + len(target_tokens) + 1,
                        dtype=np.long)
                    position_ids[len(source_tokens) + 1:] = len(source_tokens)
                    if self.block_position_encoding:
                        block_position_ids = np.concatenate(
                            (np.zeros(len(source_tokens), dtype=np.long),
                             np.arange(len(target_tokens) + 1, dtype=np.long)))
                    else:
                        block_position_ids = np.concatenate(
                            (np.zeros(len(source_tokens) + 1, dtype=np.long),
                             np.ones(len(target_tokens) + 1, dtype=np.long)))
                    position_id_batch.append(
                        np.stack([position_ids, block_position_ids], axis=0))
                else:
                    tokens, targets, loss_masks, position_ids = self.generate_blank_data(
                        sample, [generation_length],
                        attention_mask[-1],
                        rng,
                        task='generation')
                    token_batch.append(tokens)
                    target_batch.append(targets)
                    loss_mask_batch.append(loss_masks)
                    position_id_batch.append(position_ids)
                    if tokens is None:
                        print(sample, generation_length, multiple_doc)
        if self.encoder_decoder:
            return {
                'text': torch.tensor(source_batch, dtype=torch.long),
                'target': torch.tensor(target_batch, dtype=torch.long),
                'loss_mask': torch.tensor(loss_mask_batch, dtype=torch.long)
            }
        else:
            token_batch, target_batch, loss_mask_batch, position_id_batch = self.pad_batch(
                token_batch, target_batch, loss_mask_batch, position_id_batch)
            return {
                'text': torch.tensor(token_batch, dtype=torch.long),
                'target': torch.tensor(target_batch, dtype=torch.long),
                'loss_mask': torch.tensor(loss_mask_batch, dtype=torch.long),
                'position_id':
                torch.tensor(position_id_batch, dtype=torch.long),
                'attention_mask':
                torch.tensor(attention_mask, dtype=torch.long),
                'mode': mode
            }

    @staticmethod
    def pad_batch(token_batch, target_batch, loss_mask_batch,
                  position_id_batch):
        seq_lengths = list(map(len, token_batch))
        if seq_lengths.count(seq_lengths[0]) != len(seq_lengths):
            max_length = max(seq_lengths)
            token_batch = [
                np.concatenate(
                    (tokens, np.zeros(max_length - len(tokens),
                                      dtype=np.long)))
                for tokens in token_batch
            ]
            target_batch = [
                np.concatenate(
                    (targets,
                     np.zeros(max_length - len(targets), dtype=np.long)))
                for targets in target_batch
            ]
            loss_mask_batch = [
                np.concatenate(
                    (loss_masks,
                     np.zeros(max_length - len(loss_masks), dtype=np.long)))
                for loss_masks in loss_mask_batch
            ]
            position_id_batch = [
                np.concatenate((position_ids,
                                np.zeros(
                                    (2, max_length - position_ids.shape[1]),
                                    dtype=np.long)),
                               axis=1) for position_ids in position_id_batch
            ]
        return token_batch, target_batch, loss_mask_batch, position_id_batch
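Two pieces of ConstructBlockStrategy are easy to sanity-check in isolation: masked block lengths are drawn from a truncated Poisson distribution with mean avg-block-length, and sample_spans places the chosen spans so that consecutive spans are separated by at least one unmasked token. The standalone sketch below mirrors that logic; it re-implements it rather than importing the module, to avoid the mpu/deepspeed dependencies:

import random
from scipy.stats import poisson

# Truncated Poisson over block lengths 1..39, as in ConstructBlockStrategy.__init__.
avg_block_length, max_block_length = 3, 40
length_weights = [poisson.pmf(i, avg_block_length) for i in range(1, max_block_length)]

rng = random.Random(0)
span_lengths = rng.choices(range(1, max_block_length), weights=length_weights, k=4)

def sample_spans(span_lengths, total_length, rng, offset=0):
    # Same placement rule as ConstructBlockStrategy.sample_spans: sorted random offsets
    # into the unmasked budget, laid out left to right with a one-token gap after each span.
    blank_length = total_length - sum(span_lengths)
    m = blank_length - len(span_lengths) + 1
    places = sorted(rng.randrange(m + 1) for _ in span_lengths)
    spans = []
    for place, span_length in zip(places, span_lengths):
        spans.append((offset + place, offset + place + span_length))
        offset += span_length + 1
    return spans

print(sample_spans(span_lengths, total_length=128, rng=rng))  # non-overlapping (start, end) spans within [0, 128]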
modelscope/models/nlp/mglm/configure_data.py (new file, 513 lines)
@@ -0,0 +1,513 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""parses arguments and preps data loader"""

import copy
import os
import random
from bisect import bisect_right
from itertools import accumulate

import numpy as np
import torch
import torch.utils.data

from . import data_utils, mpu
from .blocklm_utils import ConstructBlockStrategy
from .data_utils.tokenization import make_tokenizer
from .utils import print_rank_0

class MultiTaskDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __init__(self,
|
||||
tasks,
|
||||
datasets,
|
||||
reweight=True,
|
||||
temperature=0.8,
|
||||
max_limit=200000):
|
||||
super(MultiTaskDataset, self).__init__()
|
||||
self.tasks = tasks
|
||||
self.datasets = datasets
|
||||
self.reweight = reweight
|
||||
self.temperature = temperature
|
||||
self.lens = [len(dataset) for dataset in datasets]
|
||||
self.weights = np.array(
|
||||
[min(length, max_limit)**temperature for length in self.lens])
|
||||
self.total_len = sum(self.lens)
|
||||
self.cumulative_lens = list(accumulate(self.lens))
|
||||
if self.reweight:
|
||||
print_rank_0(list(zip(self.tasks, self.lens, self.weights)))
|
||||
else:
|
||||
print_rank_0(list(zip(self.tasks, self.lens)))
|
||||
self.weights /= self.weights.sum()
|
||||
|
||||
def __len__(self):
|
||||
return self.total_len * 1000
|
||||
|
||||
@staticmethod
|
||||
def pet_wrapper(data):
|
||||
text = data['text']
|
||||
loss_mask = data['logit_mask']
|
||||
target = data['target']
|
||||
attention_mask = data['mask']
|
||||
position_id = data['position']
|
||||
label = data['label']
|
||||
if len(text.shape) == 2:
|
||||
text = text[label]
|
||||
loss_mask = loss_mask[label]
|
||||
target = target[label]
|
||||
attention_mask = attention_mask[label]
|
||||
position_id = position_id[label]
|
||||
else:
|
||||
target = target[label]
|
||||
if not target.shape:
|
||||
target = target.repeat(len(text))
|
||||
return {
|
||||
'text': text,
|
||||
'target': target,
|
||||
'loss_mask': loss_mask,
|
||||
'position_id': position_id,
|
||||
'attention_mask': attention_mask
|
||||
}
|
||||
|
||||
def __getitem__(self, idx):
|
||||
if self.reweight:
|
||||
rng = random.Random(idx)
|
||||
rng = np.random.RandomState(
|
||||
seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
|
||||
dataset_idx = rng.choice(
|
||||
np.arange(len(self.datasets)), p=self.weights)
|
||||
dataset = self.datasets[dataset_idx]
|
||||
sample_idx = rng.choice(np.arange(len(dataset)))
|
||||
item = self.datasets[dataset_idx][sample_idx]
|
||||
else:
|
||||
dataset_idx = bisect_right(self.cumulative_lens, idx)
|
||||
if dataset_idx == 0:
|
||||
sample_idx = idx
|
||||
else:
|
||||
sample_idx = idx - self.cumulative_lens[dataset_idx - 1]
|
||||
item = self.datasets[dataset_idx][sample_idx]
|
||||
item = self.pet_wrapper(item)
|
||||
return item
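An illustrative sketch (not part of the diff) of the temperature-based re-weighting used by MultiTaskDataset above: each task's sampling weight is min(len, max_limit) ** temperature, normalised to sum to one. The task names and dataset sizes below are invented for the example.

import numpy as np

lens = [120000, 8000, 350000]           # hypothetical per-task dataset sizes
temperature, max_limit = 0.8, 200000
weights = np.array([min(length, max_limit)**temperature for length in lens])
weights = weights / weights.sum()
# smaller tasks get a larger share than their raw proportion would give them
print(dict(zip(['task_a', 'task_b', 'task_c'], weights.round(3))))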
|
||||
|
||||
|
||||
class DataConfig:

    def __init__(self, defaults=None):
        super(DataConfig, self).__init__()
        if defaults is None:
            defaults = {}
        self.defaults = defaults

    def apply(self, args, tokenizer):
        if torch.distributed.get_rank() == 0:
            print('configuring data')
        self.apply_defaults(args)
        return make_loaders(args, tokenizer)

    def set_defaults(self, **kwargs):
        for k, v in kwargs.items():
            self.defaults[k] = v

    def apply_defaults(self, args):
        for k, v in self.defaults.items():
            k = k.replace('-', '_')
            if not hasattr(args, k):
                setattr(args, k, v)
|
||||
|
||||
|
||||
def prepare_tokenizer(args):
|
||||
add_sentinel_token = 0
|
||||
if args.sentinel_token:
|
||||
add_sentinel_token = args.max_position_embeddings
|
||||
tokenizer = make_tokenizer(
|
||||
args.tokenizer_type,
|
||||
None,
|
||||
args.tokenizer_path,
|
||||
args.vocab_size,
|
||||
args.tokenizer_model_type,
|
||||
add_block_symbols=args.block_lm,
|
||||
cache_dir=args.cache_dir,
|
||||
add_sentinel_token=add_sentinel_token,
|
||||
add_task_mask=args.task_mask,
|
||||
add_decoder_mask=args.block_mask_prob > 0.0
|
||||
or args.context_mask_ratio > 0.0)
|
||||
if mpu.get_model_parallel_rank() == 0:
|
||||
num_tokens = tokenizer.num_tokens
|
||||
eod_token = tokenizer.get_command('eos').Id
|
||||
assert eod_token == tokenizer.get_command('pad').Id
|
||||
before = num_tokens
|
||||
after = before
|
||||
multiple = args.make_vocab_size_divisible_by
|
||||
while (after % multiple) != 0:
|
||||
after += 1
|
||||
print_rank_0('> padded vocab (size: {}) with {} dummy '
|
||||
'tokens (new size: {})'.format(before, after - before,
|
||||
after))
|
||||
print_rank_0('> found end-of-document token: {}'.format(eod_token))
|
||||
token_counts = torch.cuda.LongTensor([after, eod_token])
|
||||
else:
|
||||
token_counts = torch.cuda.LongTensor([0, 0])
|
||||
# Broadcast num tokens.
|
||||
torch.distributed.broadcast(
|
||||
token_counts,
|
||||
mpu.get_model_parallel_src_rank(),
|
||||
group=mpu.get_model_parallel_group())
|
||||
num_tokens = token_counts[0].item()
|
||||
eod_token = token_counts[1].item()
|
||||
args.vocab_size, args.eod_token = num_tokens, eod_token
|
||||
return tokenizer
|
||||
|
||||
|
||||
def make_data_loader(dataset,
|
||||
tokenizer,
|
||||
batch_size,
|
||||
num_iters,
|
||||
args,
|
||||
shuffle=False,
|
||||
block_collate=False):
|
||||
world_size = torch.distributed.get_world_size(
|
||||
group=mpu.get_data_parallel_group())
|
||||
rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
|
||||
if args.loader_scatter is not None:
|
||||
rank = rank // args.loader_scatter
|
||||
world_size = world_size // args.loader_scatter
|
||||
batch_size = batch_size // args.loader_scatter
|
||||
distributed = world_size > 1
|
||||
if args.transformer_xl:
|
||||
batch_sampler = data_utils.samplers.DistributedSequentialSampler(
|
||||
len(dataset), num_iters, batch_size, rank, world_size)
|
||||
else:
|
||||
if shuffle:
|
||||
sampler = data_utils.samplers.RandomSampler(
|
||||
dataset,
|
||||
replacement=True,
|
||||
num_samples=batch_size * args.train_iters
|
||||
* args.gradient_accumulation_steps)
|
||||
else:
|
||||
sampler = torch.utils.data.SequentialSampler(dataset)
|
||||
drop_last = distributed
|
||||
# the GPUs in the same model parallel group receive the same data
|
||||
if distributed:
|
||||
batch_sampler = data_utils.samplers.DistributedBatchSampler(
|
||||
sampler,
|
||||
batch_size,
|
||||
drop_last,
|
||||
rank,
|
||||
world_size,
|
||||
gradient_accumulation_steps=args.gradient_accumulation_steps)
|
||||
else:
|
||||
batch_sampler = torch.utils.data.BatchSampler(
|
||||
sampler, batch_size, drop_last)
|
||||
collate_fn = None
|
||||
if block_collate:
|
||||
collate_fn = ConstructBlockStrategy(
|
||||
args,
|
||||
tokenizer,
|
||||
args.seq_length,
|
||||
bert_prob=args.bert_prob,
|
||||
gap_sentence_prob=args.gap_sentence_prob,
|
||||
gap_sentence_ratio=args.gap_sentence_ratio,
|
||||
gpt_infill_prob=args.gpt_infill_prob,
|
||||
average_block_length=args.avg_block_length,
|
||||
gpt_min_ratio=args.gpt_min_ratio,
|
||||
block_mask_prob=args.block_mask_prob,
|
||||
context_mask_ratio=args.context_mask_ratio,
|
||||
short_seq_prob=args.short_seq_prob,
|
||||
single_span_prob=args.single_span_prob,
|
||||
shuffle_blocks=not args.no_shuffle_block,
|
||||
block_position_encoding=not args.no_block_position,
|
||||
sentinel_token=args.sentinel_token,
|
||||
encoder_decoder=args.encoder_decoder,
|
||||
task_mask=args.task_mask,
|
||||
random_position=args.random_position,
|
||||
masked_lm=args.masked_lm).construct_blocks
|
||||
data_loader = torch.utils.data.DataLoader(
|
||||
dataset,
|
||||
batch_sampler=batch_sampler,
|
||||
num_workers=args.num_workers,
|
||||
pin_memory=True,
|
||||
collate_fn=collate_fn)
|
||||
|
||||
return data_loader
|
||||
|
||||
|
||||
def make_tfrecord_loaders(args):
|
||||
"""Load train/val/test dataset from shuffled TFRecords"""
|
||||
|
||||
import data_utils.tf_dl
|
||||
data_set_args = {
|
||||
'batch_size': args.batch_size,
|
||||
'max_seq_len': args.seq_length,
|
||||
'max_preds_per_seq': args.max_preds_per_seq,
|
||||
'train': True,
|
||||
'num_workers': max(args.num_workers, 1),
|
||||
'seed': args.seed + args.rank + 1,
|
||||
'threaded_dl': args.num_workers > 0
|
||||
}
|
||||
train = data_utils.tf_dl.TFRecordDataLoader(args.train_data,
|
||||
**data_set_args)
|
||||
data_set_args['train'] = False
|
||||
if args.eval_seq_length is not None:
|
||||
data_set_args['max_seq_len'] = args.eval_seq_length
|
||||
if args.eval_max_preds_per_seq is not None:
|
||||
data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
|
||||
valid = None
|
||||
if args.valid_data is not None:
|
||||
valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data,
|
||||
**data_set_args)
|
||||
test = None
|
||||
if args.test_data is not None:
|
||||
test = data_utils.tf_dl.TFRecordDataLoader(args.test_data,
|
||||
**data_set_args)
|
||||
tokenizer = data_utils.make_tokenizer(
|
||||
args.tokenizer_type,
|
||||
train,
|
||||
args.tokenizer_path,
|
||||
args.vocab_size,
|
||||
args.tokenizer_model_type,
|
||||
cache_dir=args.cache_dir)
|
||||
|
||||
return (train, valid, test), tokenizer
|
||||
|
||||
|
||||
def make_loaders(args, tokenizer):
|
||||
"""makes training/val/test"""
|
||||
|
||||
if args.use_tfrecords:
|
||||
return make_tfrecord_loaders(args)
|
||||
world_size = torch.distributed.get_world_size(
|
||||
group=mpu.get_data_parallel_group())
|
||||
if args.loader_scatter is not None:
|
||||
assert world_size % args.loader_scatter == 0
|
||||
batch_size = args.batch_size * world_size
|
||||
eval_batch_size = batch_size
|
||||
if args.eval_batch_size is not None:
|
||||
eval_batch_size = args.eval_batch_size * world_size
|
||||
seq_length = args.seq_length
|
||||
if seq_length < 0:
|
||||
seq_length = seq_length * world_size
|
||||
eval_seq_length = args.eval_seq_length
|
||||
if eval_seq_length is not None and eval_seq_length < 0:
|
||||
eval_seq_length = eval_seq_length * world_size
|
||||
split = get_split(args)
|
||||
data_set_args = {
|
||||
'path': args.train_data,
|
||||
'seq_length': seq_length,
|
||||
'mem_length': args.mem_length,
|
||||
'delim': args.delim,
|
||||
'text_key': args.text_key,
|
||||
'label_key': 'label',
|
||||
'ds_type': args.data_set_type,
|
||||
'split': split,
|
||||
'loose': args.loose_json,
|
||||
'max_preds_per_seq': args.max_preds_per_seq,
|
||||
'presplit_sentences': args.presplit_sentences,
|
||||
'sample_one_document': args.sample_one_document,
|
||||
'filter_english': args.filter_english,
|
||||
'pre_tokenize': not args.no_pre_tokenize,
|
||||
'tokenizer': tokenizer,
|
||||
'save_splits': args.save_splits,
|
||||
'load_splits': args.load_splits,
|
||||
'save_test_data': args.save_test_data,
|
||||
'no_lazy_loader': args.no_lazy_loader,
|
||||
'loader_scatter': args.loader_scatter,
|
||||
'data_parallel_rank': mpu.get_data_parallel_rank(),
|
||||
'non_sentence_start': args.non_sentence_start,
|
||||
'half_lazy_loader': args.half_lazy_loader
|
||||
}
|
||||
|
||||
eval_set_args = copy.copy(data_set_args)
|
||||
eval_set_args['split'] = [1.]
|
||||
# if optional eval args were set then replace their
|
||||
# equivalent values in the arg dict
|
||||
if eval_seq_length:
|
||||
eval_set_args['seq_length'] = eval_seq_length
|
||||
if args.eval_max_preds_per_seq:
|
||||
eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
|
||||
if args.eval_text_key is not None:
|
||||
eval_set_args['text_key'] = args.eval_text_key
|
||||
|
||||
# make datasets splits and tokenizer
|
||||
train, valid, test = None, None, None
|
||||
|
||||
if args.train_data is not None:
|
||||
train = data_utils.make_dataset(**data_set_args)
|
||||
if data_utils.should_split(split):
|
||||
train, valid, test = train
|
||||
eval_set_args['tokenizer'] = tokenizer
|
||||
|
||||
# make training and val dataset if necessary
|
||||
if valid is None and args.valid_data is not None:
|
||||
eval_set_args['path'] = args.valid_data
|
||||
valid = data_utils.make_dataset(**eval_set_args)
|
||||
eval_set_args['tokenizer'] = tokenizer
|
||||
if test is None and args.test_data is not None:
|
||||
eval_set_args['path'] = args.test_data
|
||||
test = data_utils.make_dataset(**eval_set_args)
|
||||
|
||||
# wrap datasets with data loader
|
||||
use_block = args.block_lm or args.encoder_decoder
|
||||
|
||||
if train is not None and args.batch_size > 0:
|
||||
train = make_data_loader(
|
||||
train,
|
||||
tokenizer,
|
||||
batch_size,
|
||||
args.train_iters,
|
||||
args,
|
||||
shuffle=args.shuffle,
|
||||
block_collate=use_block)
|
||||
args.do_train = True
|
||||
else:
|
||||
args.do_train = False
|
||||
eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
|
||||
if valid is not None:
|
||||
valid = make_data_loader(
|
||||
valid,
|
||||
tokenizer,
|
||||
eval_batch_size,
|
||||
args.train_iters,
|
||||
args,
|
||||
shuffle=args.shuffle,
|
||||
block_collate=use_block)
|
||||
args.do_valid = True
|
||||
else:
|
||||
args.do_valid = False
|
||||
if test is not None:
|
||||
test = make_data_loader(
|
||||
test,
|
||||
tokenizer,
|
||||
eval_batch_size,
|
||||
len(test) // eval_batch_size + 1,
|
||||
args,
|
||||
shuffle=args.shuffle,
|
||||
block_collate=use_block)
|
||||
args.do_test = True
|
||||
else:
|
||||
args.do_test = False
|
||||
|
||||
return train, valid, test
|
||||
|
||||
|
||||
def build_multi_task_dataset(args, tokenizer):
|
||||
task_dirs = {
|
||||
'mnli': 'MNLI',
|
||||
'cola': 'CoLA',
|
||||
'mrpc': 'MRPC',
|
||||
'qnli': 'QNLI',
|
||||
'qqp': 'QQP',
|
||||
'sst2': 'SST-2',
|
||||
'agnews': 'Agnews',
|
||||
'yelp-polarity': 'yelp_review_polarity_csv',
|
||||
'yelp-full': 'yelp_review_full_csv',
|
||||
'yahoo': 'Yahoo',
|
||||
'squad': 'SQuAD',
|
||||
'race': 'RACE'
|
||||
}
|
||||
train, valid = None, None
|
||||
if mpu.get_model_parallel_rank() == 0:
|
||||
multi_seq_length = args.seq_length
|
||||
if args.multi_seq_length is not None:
|
||||
multi_seq_length = args.multi_seq_length
|
||||
train_datasets, valid_datasets = [], []
|
||||
for task in args.multi_task_data:
|
||||
task = task.lower()
|
||||
data_dir = os.path.join(args.data_dir, task_dirs[task])
|
||||
train_datasets.append(
|
||||
SuperGlueDataset(
|
||||
args,
|
||||
task,
|
||||
data_dir,
|
||||
multi_seq_length,
|
||||
'train',
|
||||
tokenizer,
|
||||
pattern_ensemble=True))
|
||||
valid_datasets.append(
|
||||
SuperGlueDataset(
|
||||
args,
|
||||
task,
|
||||
data_dir,
|
||||
multi_seq_length,
|
||||
'dev',
|
||||
tokenizer,
|
||||
pattern_ensemble=True))
|
||||
train = MultiTaskDataset(args.multi_task_data, train_datasets)
|
||||
valid = MultiTaskDataset(args.multi_task_data, valid_datasets)
|
||||
world_size = torch.distributed.get_world_size(
|
||||
group=mpu.get_data_parallel_group())
|
||||
multi_batch_size = args.batch_size * world_size
|
||||
if args.multi_batch_size is not None:
|
||||
multi_batch_size = args.multi_batch_size * world_size
|
||||
train = make_data_loader(
|
||||
train,
|
||||
tokenizer,
|
||||
multi_batch_size,
|
||||
args.train_iters,
|
||||
args,
|
||||
shuffle=True)
|
||||
valid = make_data_loader(
|
||||
valid,
|
||||
tokenizer,
|
||||
multi_batch_size,
|
||||
args.train_iters,
|
||||
args,
|
||||
shuffle=True)
|
||||
return train, valid
|
||||
|
||||
|
||||
def get_split(args):
    """
    Get dataset splits from comma separated string list
    """
    splits = []
    if args.split.find(',') != -1:
        splits = [float(s) for s in args.split.split(',')]
    elif args.split.find('/') != -1:
        splits = [float(s) for s in args.split.split('/')]
    else:
        splits = [float(args.split)]
    split_total = sum(splits)
    if split_total < 1.:
        splits.append(1 - split_total)
    while len(splits) < 3:
        splits.append(0.)
    splits = splits[:3]
    if args.valid_data is not None:
        splits[1] = 0.
    if args.test_data is not None:
        splits[2] = 0.
    final_sum = sum(splits)
    return [s / final_sum for s in splits]
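A quick illustration (not part of the commit) of how get_split normalises the split string; the argument object is faked with a SimpleNamespace for the example.

from types import SimpleNamespace

args = SimpleNamespace(split='900,50,50', valid_data=None, test_data=None)
print(get_split(args))  # -> [0.9, 0.05, 0.05]

# a single value below 1.0 gets the remainder appended, then the list is padded to three entries
args = SimpleNamespace(split='0.8', valid_data=None, test_data=None)
print(get_split(args))  # -> [0.8, 0.2, 0.0]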
|
||||
|
||||
|
||||
def configure_data():
    """add cmdline flags for configuring datasets"""
    # These are options that are used by data_utils, but are either
    # deprecated or not meant to be exposed to the command line user.
    # These options are intended to be set in code by specific scripts.
    defaults = {
        'world_size': 1,
        'rank': -1,
        'persist_state': 0,
        'lazy': False,
        'transpose': False,
        'data_set_type': 'supervised',
        'seq_length': 256,
        'eval_seq_length': 256,
        'samples_per_shard': 100
    }

    return DataConfig(defaults=defaults)
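An illustrative sketch (not part of the diff) of how the DataConfig returned by configure_data() is typically used to back-fill defaults onto parsed arguments; the argument object below is a stand-in, and DataConfig.apply() would additionally build the loaders (which requires torch.distributed to be initialised).

from types import SimpleNamespace

data_config = configure_data()
data_config.set_defaults(data_set_type='Block', transpose=False)

args = SimpleNamespace(seq_length=512)       # pretend parsed command-line args
data_config.apply_defaults(args)             # fills in any attribute args is missing
print(args.data_set_type, args.seq_length)   # -> Block 512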
|
||||
341
modelscope/models/nlp/mglm/data_utils/__init__.py
Normal file
@@ -0,0 +1,341 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""utils for creating datasets"""
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from . import corpora
|
||||
from .datasets import (BertSentencepairDataset, BlockDataset, ConcatDataset,
|
||||
GPT2Dataset, ShuffleDataset, SplitDataset, XLDataset,
|
||||
split_ds)
|
||||
from .lazy_loader import (LazyLoader, LazyWriter, exists_lazy, exists_scatter,
|
||||
get_scatter_path)
|
||||
from .samplers import DistributedBatchSampler
|
||||
from .tokenization import (BertWordPieceTokenizer, CharacterLevelTokenizer,
|
||||
CommandToken, GPT2BPETokenizer, Tokenization,
|
||||
Tokenizer, make_tokenizer)
|
||||
|
||||
TRAIN_DATA = 0
|
||||
VAL_DATA = 1
|
||||
TEST_DATA = 2
|
||||
|
||||
|
||||
def should_split(split):
    """
    given split proportions checks if should split
    Examples:
        >>> should_split([10,0,0])
        False
        >>> should_split([1,.1,.2])
        True
    """
    return max(split) / sum(split) != 1.
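A small runnable illustration (not part of the diff) of the check above; when it returns True, make_dataset further down returns a (train, valid, test) tuple instead of a single dataset.

print(should_split([1.0, 0.0, 0.0]))   # False - no splitting, one dataset comes back
print(should_split([0.8, 0.1, 0.1]))   # True  - split_ds() will carve out train/valid/test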
|
||||
|
||||
|
||||
def get_ext(path):
    """gets path extension"""
    return os.path.splitext(path)[1]
|
||||
|
||||
|
||||
def get_dataset(name,
|
||||
tokenizer,
|
||||
pre_tokenize,
|
||||
data_parallel_rank,
|
||||
loader_scatter=None,
|
||||
no_lazy_loader=False,
|
||||
half_lazy_loader=False):
|
||||
"""gets dataset object based on keyword args and file at `path`"""
|
||||
global_rank = torch.distributed.get_rank()
|
||||
if not supported_corpus(name):
|
||||
raise NotImplementedError('dataset %s is not supported' % name)
|
||||
dataset = corpora.NAMED_CORPORA[name]
|
||||
path = dataset.PATH
|
||||
if issubclass(dataset, corpora.PromptReader):
|
||||
if not (exists_lazy(path, data_type='prompt')
|
||||
and exists_lazy(path, data_type='text')) and not (
|
||||
loader_scatter is not None and exists_scatter(
|
||||
path, data_type='prompt', scatter_num=loader_scatter)
|
||||
and exists_scatter(
|
||||
path, data_type='text', scatter_num=loader_scatter)):
|
||||
# create cached version of dataset for lazy loading if it doesn't exist
|
||||
if global_rank == 0:
|
||||
print(f'Creating lazy loader for dataset {name}')
|
||||
prompt_writer = LazyWriter(
|
||||
path, data_type='prompt', is_array=pre_tokenize)
|
||||
text_writer = LazyWriter(
|
||||
path, data_type='text', is_array=pre_tokenize)
|
||||
writers = {'prompt': prompt_writer, 'text': text_writer}
|
||||
reader = dataset(
|
||||
writers=writers,
|
||||
tokenizer=tokenizer,
|
||||
tokenize=pre_tokenize)
|
||||
reader.process()
|
||||
prompt_writer.close()
|
||||
text_writer.close()
|
||||
else:
|
||||
while not os.path.exists(
|
||||
LazyWriter.get_len_path(path, data_type='prompt')):
|
||||
time.sleep(1)
|
||||
map_fn = (lambda x: x.tolist()) if pre_tokenize else None
|
||||
if loader_scatter is not None:
|
||||
if not (exists_scatter(
|
||||
path, data_type='prompt', scatter_num=loader_scatter)
|
||||
and exists_scatter(
|
||||
path, data_type='text', scatter_num=loader_scatter)):
|
||||
if global_rank == 0:
|
||||
print(f'Creating scatter loader for dataset {name}')
|
||||
prompts = LazyLoader(
|
||||
path,
|
||||
data_type='prompt',
|
||||
map_fn=map_fn,
|
||||
mem_map=True,
|
||||
is_array=pre_tokenize)
|
||||
texts = LazyLoader(
|
||||
path,
|
||||
data_type='text',
|
||||
map_fn=map_fn,
|
||||
mem_map=True,
|
||||
is_array=pre_tokenize)
|
||||
indices = list(range(len(texts)))
|
||||
random.shuffle(indices)
|
||||
segment_length = (len(indices) - 1) // loader_scatter + 1
|
||||
for i in range(loader_scatter):
|
||||
scatter_path = get_scatter_path(path, scatter_rank=i)
|
||||
prompt_writer = LazyWriter(
|
||||
scatter_path,
|
||||
data_type='prompt',
|
||||
is_array=pre_tokenize)
|
||||
text_writer = LazyWriter(
|
||||
scatter_path,
|
||||
data_type='text',
|
||||
is_array=pre_tokenize)
|
||||
for idx in indices[i * segment_length:(i + 1)
|
||||
* segment_length]:
|
||||
prompt_writer.write(prompts[idx])
|
||||
text_writer.write(texts[idx])
|
||||
prompt_writer.close()
|
||||
text_writer.close()
|
||||
else:
|
||||
while not (exists_scatter(
|
||||
path, data_type='prompt',
|
||||
scatter_num=loader_scatter) and exists_scatter(
|
||||
path,
|
||||
data_type='text',
|
||||
scatter_num=loader_scatter)):
|
||||
time.sleep(1)
|
||||
scatter_path = get_scatter_path(
|
||||
path, scatter_rank=data_parallel_rank % loader_scatter)
|
||||
print(f'Rank {global_rank} is using scatter from {scatter_path}')
|
||||
prompts = LazyLoader(
|
||||
scatter_path,
|
||||
data_type='prompt',
|
||||
map_fn=map_fn,
|
||||
mem_map=True,
|
||||
is_array=pre_tokenize,
|
||||
load_memory=no_lazy_loader,
|
||||
half_load=half_lazy_loader)
|
||||
texts = LazyLoader(
|
||||
scatter_path,
|
||||
data_type='text',
|
||||
map_fn=map_fn,
|
||||
mem_map=True,
|
||||
is_array=pre_tokenize,
|
||||
load_memory=no_lazy_loader,
|
||||
half_load=half_lazy_loader)
|
||||
else:
|
||||
prompts = LazyLoader(
|
||||
path,
|
||||
data_type='prompt',
|
||||
map_fn=map_fn,
|
||||
mem_map=True,
|
||||
is_array=pre_tokenize,
|
||||
load_memory=no_lazy_loader,
|
||||
half_load=half_lazy_loader)
|
||||
texts = LazyLoader(
|
||||
path,
|
||||
data_type='text',
|
||||
map_fn=map_fn,
|
||||
mem_map=True,
|
||||
is_array=pre_tokenize,
|
||||
load_memory=no_lazy_loader,
|
||||
half_load=half_lazy_loader)
|
||||
text = corpora.PromptDataset(
|
||||
prompt_loader=prompts,
|
||||
text_loader=texts,
|
||||
tokenizer=tokenizer,
|
||||
to_tokenize=not pre_tokenize)
|
||||
if loader_scatter is None:
|
||||
if global_rank == 0:
|
||||
print(f'Create dataset {name} with {len(text)} documents')
|
||||
for i in range(10):
|
||||
rand_id = i if i < 5 else random.randrange(len(text))
|
||||
sample_tokens = text[rand_id]['tokens'][:1024]
|
||||
print(sample_tokens)
|
||||
print(tokenizer.DecodeIds(sample_tokens).encode('utf-8'))
|
||||
else:
|
||||
for scatter_id in range(loader_scatter):
|
||||
if data_parallel_rank % loader_scatter == scatter_id and data_parallel_rank // loader_scatter == 0:
|
||||
print(
|
||||
f'Create dataset {name} at scatter {scatter_id} with {len(text)} documents'
|
||||
)
|
||||
for i in range(10):
|
||||
sample_tokens = text[i]['tokens'][:1024]
|
||||
print(sample_tokens)
|
||||
print(tokenizer.DecodeIds(sample_tokens))
|
||||
torch.distributed.barrier()
|
||||
return text
|
||||
elif issubclass(dataset, corpora.KeyReader):
|
||||
if not (exists_lazy(path, data_type='text')
|
||||
and exists_lazy(path, data_type='mask')):
|
||||
# create cached version of dataset for lazy loading if it doesn't exist
|
||||
if global_rank == 0:
|
||||
text_writer = LazyWriter(
|
||||
path, data_type='text', is_array=pre_tokenize)
|
||||
mask_writer = LazyWriter(path, data_type='mask', is_array=True)
|
||||
writers = {'mask': mask_writer, 'text': text_writer}
|
||||
dataset(
|
||||
writers=writers,
|
||||
tokenizer=tokenizer,
|
||||
tokenize=pre_tokenize)
|
||||
mask_writer.close()
|
||||
text_writer.close()
|
||||
else:
|
||||
while not os.path.exists(
|
||||
LazyWriter.get_len_path(path, data_type='mask')):
|
||||
time.sleep(1)
|
||||
map_fn = (lambda x: x.tolist()) if pre_tokenize else None
|
||||
masks = LazyLoader(
|
||||
path, data_type='mask', map_fn=map_fn, mem_map=True, is_array=True)
|
||||
texts = LazyLoader(
|
||||
path,
|
||||
data_type='text',
|
||||
map_fn=map_fn,
|
||||
mem_map=True,
|
||||
is_array=pre_tokenize)
|
||||
text = corpora.KeyDataset(
|
||||
mask_loader=masks,
|
||||
text_loader=texts,
|
||||
tokenizer=tokenizer,
|
||||
to_tokenize=not pre_tokenize)
|
||||
return text
|
||||
|
||||
|
||||
def supported_corpus(corpus_name):
|
||||
"""checks if corpus name is defined in `corpora.py`"""
|
||||
return corpus_name in corpora.NAMED_CORPORA
|
||||
|
||||
|
||||
def make_dataset(path,
|
||||
seq_length,
|
||||
mem_length,
|
||||
shuffle=True,
|
||||
split=None,
|
||||
tokenizer=None,
|
||||
sample_one_document=False,
|
||||
pre_tokenize=False,
|
||||
ds_type='',
|
||||
save_splits=None,
|
||||
load_splits=None,
|
||||
save_test_data=None,
|
||||
no_lazy_loader=False,
|
||||
loader_scatter=None,
|
||||
data_parallel_rank=None,
|
||||
filter_english=False,
|
||||
non_sentence_start=0.0,
|
||||
half_lazy_loader=False,
|
||||
**kwargs):
|
||||
"""function to create datasets+tokenizers for common options"""
|
||||
if split is None:
|
||||
split = [1.]
|
||||
|
||||
# get one or multiple datasets and concatenate
|
||||
if isinstance(path, str):
|
||||
ds = get_dataset(
|
||||
path,
|
||||
tokenizer=tokenizer,
|
||||
pre_tokenize=pre_tokenize,
|
||||
no_lazy_loader=no_lazy_loader,
|
||||
loader_scatter=loader_scatter,
|
||||
data_parallel_rank=data_parallel_rank,
|
||||
half_lazy_loader=half_lazy_loader)
|
||||
else:
|
||||
ds = [
|
||||
get_dataset(
|
||||
p,
|
||||
tokenizer=tokenizer,
|
||||
pre_tokenize=pre_tokenize,
|
||||
no_lazy_loader=no_lazy_loader,
|
||||
loader_scatter=loader_scatter,
|
||||
data_parallel_rank=data_parallel_rank,
|
||||
half_lazy_loader=half_lazy_loader) for p in path
|
||||
]
|
||||
ds = ConcatDataset(ds)
|
||||
|
||||
# Split dataset into train/val/test (and wrap bert dataset)
|
||||
def wrap_dataset(dataset):
|
||||
if ds_type.lower() == 'bert':
|
||||
presplit_sentences = kwargs[
|
||||
'presplit_sentences'] if 'presplit_sentences' in kwargs else False
|
||||
dataset = BertSentencepairDataset(
|
||||
dataset,
|
||||
max_seq_len=seq_length,
|
||||
presplit_sentences=presplit_sentences)
|
||||
elif ds_type.lower() == 'gpt-xl':
|
||||
assert pre_tokenize
|
||||
dataset = XLDataset(
|
||||
dataset,
|
||||
tokenizer,
|
||||
max_seq_len=seq_length,
|
||||
mem_len=mem_length,
|
||||
sample_across_doc=not sample_one_document)
|
||||
elif ds_type.lower() == 'gpt2':
|
||||
dataset = GPT2Dataset(
|
||||
dataset,
|
||||
tokenizer,
|
||||
max_seq_len=seq_length,
|
||||
sample_across_doc=not sample_one_document)
|
||||
elif ds_type.lower() == 'block':
|
||||
dataset = BlockDataset(
|
||||
dataset,
|
||||
tokenizer,
|
||||
max_seq_len=seq_length,
|
||||
sample_across_doc=not sample_one_document,
|
||||
filter_english=filter_english,
|
||||
non_sentence_start=non_sentence_start)
|
||||
return dataset
|
||||
|
||||
if should_split(split):
|
||||
ds = split_ds(
|
||||
ds,
|
||||
split,
|
||||
shuffle=shuffle,
|
||||
save_splits=save_splits,
|
||||
load_splits=load_splits)
|
||||
if save_test_data is not None and torch.distributed.get_rank() == 0:
|
||||
test_ds = ds[-1]
|
||||
with open(save_test_data, 'w', encoding='utf-8') as output:
|
||||
for data in test_ds:
|
||||
text = data['tokens']
|
||||
text = tokenizer.DecodeIds(text)
|
||||
output.write(text)
|
||||
output.write('\n')
|
||||
print(f'Write test data to {save_test_data}')
|
||||
ds = [wrap_dataset(d) if d is not None else None for d in ds]
|
||||
else:
|
||||
ds = wrap_dataset(ds)
|
||||
return ds
|
||||
583
modelscope/models/nlp/mglm/data_utils/corpora.py
Executable file
@@ -0,0 +1,583 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""several datasets with preset arguments"""
|
||||
import os
|
||||
import random
|
||||
from collections import defaultdict
|
||||
from multiprocessing import Process, Queue
|
||||
from queue import Empty
|
||||
|
||||
import json
|
||||
import tqdm
|
||||
from torch.utils import data
|
||||
|
||||
from modelscope.models.nlp.mglm.utils import print_rank_0
|
||||
from .datasets import csv_dataset, json_dataset
|
||||
from .lazy_loader import LazyLoader
|
||||
|
||||
NUM_PROCESSES = 100
|
||||
|
||||
|
||||
def punctuation_standardization(string: str):
    punctuation_dict = {
        '\u201c': "\"",
        '\u201d': "\"",
        '\u2019': "'",
        '\u2018': "'",
        '\u2013': '-'
    }
    for key, value in punctuation_dict.items():
        string = string.replace(key, value)
    return string
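For illustration only (not part of the commit): the helper maps curly quotes and en-dashes to their ASCII equivalents before tokenization.

print(punctuation_standardization('\u201cSmart quotes\u201d \u2013 and \u2018apostrophes\u2019'))
# -> "Smart quotes" - and 'apostrophes'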
|
||||
|
||||
|
||||
class KeyDataset(data.Dataset):
|
||||
|
||||
def __init__(self, text_loader, mask_loader, **kwargs):
|
||||
self.texts = text_loader
|
||||
self.masks = mask_loader
|
||||
self.is_lazy = False
|
||||
if isinstance(self.texts, LazyLoader) and isinstance(
|
||||
self.masks, LazyLoader):
|
||||
self.text_lens = self.texts.lens
|
||||
self.is_lazy = True
|
||||
|
||||
def get_text_len(self, idx):
|
||||
return self.text_lens[idx]
|
||||
|
||||
def __getitem__(self, index):
|
||||
text = self.texts[index]
|
||||
mask_length = self.masks[index]
|
||||
mask = []
|
||||
for i, length in enumerate(mask_length):
|
||||
if i % 2 == 0:
|
||||
mask += [0] * length
|
||||
else:
|
||||
mask += [1] * length
|
||||
assert len(text) == len(mask)
|
||||
return {'tokens': text, 'loss_masks': mask}
|
||||
|
||||
def __len__(self):
|
||||
return len(self.texts)
|
||||
|
||||
|
||||
class PromptDataset(data.Dataset):
|
||||
|
||||
def __init__(self,
|
||||
prompt_loader,
|
||||
text_loader,
|
||||
tokenizer=None,
|
||||
to_tokenize=False,
|
||||
**kwargs):
|
||||
self.prompts = prompt_loader
|
||||
self.texts = text_loader
|
||||
self.tokenizer = tokenizer
|
||||
self.to_tokenize = to_tokenize
|
||||
if isinstance(self.prompts, LazyLoader) and isinstance(
|
||||
self.texts, LazyLoader):
|
||||
self.prompt_lens = self.prompts.lens
|
||||
self.text_lens = self.texts.lens
|
||||
self.is_lazy = True
|
||||
|
||||
def get_text_len(self, idx):
|
||||
return self.prompt_lens[idx] + self.text_lens[idx]
|
||||
|
||||
def __getitem__(self, index):
|
||||
prompt = self.prompts[index]
|
||||
text = self.texts[index]
|
||||
if self.to_tokenize:
|
||||
prompt = self.tokenizer.EncodeAsIds(prompt).tokenization
|
||||
text = self.tokenizer.EncodeAsIds(text).tokenization
|
||||
return {
|
||||
'tokens': prompt + text,
|
||||
'loss_masks': [0] * len(prompt) + [1] * len(text)
|
||||
}
|
||||
|
||||
def __len__(self):
|
||||
return len(self.prompts)
|
||||
|
||||
|
||||
class DataReader:
|
||||
PATH = None
|
||||
assert_str = None
|
||||
reserve_punct = False
|
||||
split_row = True
|
||||
TASK_QUEUE_LIMIT = 10000000
|
||||
DONE_QUEUE_LIMIT = 10000000
|
||||
|
||||
def tokenize_worker(self, input, output, info, tokenizer, tokenize):
|
||||
raise NotImplementedError
|
||||
|
||||
def print_info(self, info):
|
||||
pass
|
||||
|
||||
def __init__(self, writers, tokenizer=None, tokenize=False, **kwargs):
|
||||
print(self.PATH)
|
||||
print(self.assert_str)
|
||||
assert os.path.exists(self.PATH), self.assert_str
|
||||
print_rank_0(f'Creating dataset from {self.PATH}')
|
||||
self.tokenizer = tokenizer
|
||||
self.tokenize = tokenize
|
||||
self.writers = writers
|
||||
|
||||
def process(self):
|
||||
if os.path.isdir(self.PATH):
|
||||
paths = [
|
||||
os.path.join(top, name) for top, _, names in os.walk(self.PATH)
|
||||
for name in names
|
||||
]
|
||||
# paths = [entry.path for entry in os.scandir(self.PATH) if
|
||||
# not entry.is_dir() and not entry.name.endswith("bz2")]
|
||||
else:
|
||||
paths = [self.PATH]
|
||||
task_queue, done_queue, info_queue = Queue(
|
||||
maxsize=self.TASK_QUEUE_LIMIT), Queue(
|
||||
maxsize=self.DONE_QUEUE_LIMIT), Queue()
|
||||
processes = []
|
||||
for i in range(NUM_PROCESSES):
|
||||
process = Process(
|
||||
target=self.tokenize_worker,
|
||||
args=(task_queue, done_queue, info_queue, self.tokenizer,
|
||||
self.tokenize))
|
||||
process.start()
|
||||
processes.append(process)
|
||||
|
||||
def read_input_to_queue():
|
||||
for path in paths:
|
||||
print_rank_0(f'Start reading {path}')
|
||||
with open(path) as file:
|
||||
items = json.load(file)
|
||||
for item in items:
|
||||
task_queue.put(item)
|
||||
# if self.split_row:
|
||||
# for row in file:
|
||||
# task_queue.put(row)
|
||||
# else:
|
||||
# items = json.load(file)
|
||||
# for item in items["RECORDS"]:
|
||||
# task_queue.put(item)
|
||||
print_rank_0('Read input complete')
|
||||
for i in range(len(processes)):
|
||||
task_queue.put('STOP')
|
||||
|
||||
process = Process(target=read_input_to_queue)
|
||||
process.start()
|
||||
count = len(processes)
|
||||
progress_bar = tqdm.tqdm()
|
||||
while True:
|
||||
data = done_queue.get()
|
||||
if data == 'COMPLETE':
|
||||
count -= 1
|
||||
if count == 0:
|
||||
break
|
||||
else:
|
||||
self.write_result(data, self.writers)
|
||||
progress_bar.update()
|
||||
progress_bar.close()
|
||||
self.print_info(info_queue)
|
||||
|
||||
@staticmethod
|
||||
def write_result(data, writers):
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def get_token_count(contents):
|
||||
return sum(map(len, contents))
|
||||
|
||||
@classmethod
|
||||
def process_sample(cls, text, tokenizer, tokenize):
|
||||
if isinstance(text, str) and tokenize:
|
||||
if not cls.reserve_punct:
|
||||
text = punctuation_standardization(text)
|
||||
text = tokenizer.EncodeAsIds(text).tokenization if text else []
|
||||
return text
|
||||
|
||||
@staticmethod
|
||||
def trim_field(content, max_length):
|
||||
if len(content) > max_length:
|
||||
content = content[:max_length]
|
||||
content += '......'
|
||||
return content
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class PromptReader(DataReader):
|
||||
is_json = True
|
||||
|
||||
def tokenize_worker(self, input, output, info, tokenizer, tokenize):
|
||||
for row in iter(input.get, 'STOP'):
|
||||
if row:
|
||||
if self.is_json:
|
||||
row = row.rstrip()
|
||||
row = json.loads(row)
|
||||
prompts, texts = self.process_line(row, tokenizer, tokenize)
|
||||
for prompt, text in zip(prompts, texts):
|
||||
output.put((prompt, text))
|
||||
output.put('COMPLETE')
|
||||
|
||||
@staticmethod
|
||||
def write_result(data, writers):
|
||||
prompt, text = data
|
||||
writers['prompt'].write(prompt)
|
||||
writers['text'].write(text)
|
||||
|
||||
|
||||
class KeyReader(DataReader):
|
||||
PATH = '/root/data/wikipedia/wiki-key.txt'
|
||||
assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py'
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
keys, contents = data['key'], data['content']
|
||||
assert len(keys) == len(contents)
|
||||
for i in range(1, len(keys)):
|
||||
keys[i] = ' ' + keys[i]
|
||||
contents = [' ' + content for content in contents]
|
||||
keys = [tokenizer.EncodeAsIds(key).tokenization for key in keys]
|
||||
contents = [
|
||||
tokenizer.EncodeAsIds(content).tokenization for content in contents
|
||||
]
|
||||
summary = sum(keys, [])
|
||||
summary_prefix = self.process_sample('Summary: ', tokenizer, tokenize)
|
||||
summary_mask = [len(summary_prefix), len(summary)]
|
||||
summary = summary_prefix + summary
|
||||
text, text_mask = [], []
|
||||
for key, content in zip(keys, contents):
|
||||
content = content + [tokenizer.get_command('eop').Id]
|
||||
text += key
|
||||
text += content
|
||||
text_mask.append(len(key))
|
||||
text_mask.append(len(content))
|
||||
return (summary, summary_mask), (text, text_mask)
|
||||
|
||||
def tokenize_worker(self, input, output, info, tokenizer, tokenize):
|
||||
for row in iter(input.get, 'STOP'):
|
||||
data = json.loads(row)
|
||||
summary, content = self.process_line(data, tokenizer, tokenize)
|
||||
output.put((summary, content))
|
||||
output.put('COMPLETE')
|
||||
|
||||
@staticmethod
|
||||
def write_result(data, writers):
|
||||
summary, content = data
|
||||
writers['text'].write(summary[0])
|
||||
writers['mask'].write(summary[1])
|
||||
writers['text'].write(content[0])
|
||||
writers['mask'].write(content[1])
|
||||
|
||||
|
||||
class zhihu(PromptReader):
|
||||
PATH = '/dataset/fd5061f6/data/tokenize_data/zhihu.lazy'
|
||||
reserve_punct = True
|
||||
assert_str = 'make sure to set PATH for zhihu data_utils/corpora.py'
|
||||
qtitle_prefix = '问题:'
|
||||
qcontent_prefix = '问题描述:'
|
||||
user_prefix = '回答用户:'
|
||||
answer_prefix = ' 回答:'
|
||||
|
||||
# qtitle_prefix = []
|
||||
# qcontent_prefix = []
|
||||
# user_prefix = []
|
||||
# answer_prefix = []
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
prompts, texts = [], []
|
||||
ans_length = len(data.get('ans-content', ''))
|
||||
ans_up = data.get('ans-up-num', '')
|
||||
ans_up = int(ans_up) if ans_up else 0
|
||||
if ans_length > 100 or ans_up > 1000:
|
||||
qtitle = data['q_title']
|
||||
qcontent = data['q-content']
|
||||
if qcontent is None:
|
||||
qcontent = ''
|
||||
qcontent = self.trim_field(qcontent, max_length=100)
|
||||
user = data.get('user-signature', '')
|
||||
prompt = self.qtitle_prefix + qtitle + self.qcontent_prefix + qcontent + self.user_prefix + user + self.answer_prefix # noqa
|
||||
text = data['ans-content']
|
||||
prompt, text = self.process_sample(prompt, tokenizer,
|
||||
tokenize), self.process_sample(
|
||||
text, tokenizer, tokenize)
|
||||
prompts.append(prompt)
|
||||
texts.append(text)
|
||||
# prompt = data["q_title"] + data["q-content"] + data["user-signature"]
|
||||
# text = data["ans-content"]
|
||||
# prompts.append(prompt)
|
||||
# texts.append(text)
|
||||
return prompts, texts
|
||||
|
||||
|
||||
class zhidao(PromptReader):
|
||||
PATH = '/root/data/zhidao/zhidao'
|
||||
reserve_punct = True
|
||||
assert_str = 'make sure to set PATH for zhidao data_utils/corpora.py'
|
||||
qtitle_prefix = '问题:'
|
||||
qcontent_prefix = '问题描述:'
|
||||
answer_prefix = '回答:'
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
if 'title' not in data:
|
||||
return [], []
|
||||
prompts, texts = [], []
|
||||
qtitle = data['title']
|
||||
qcontent = data.get('content', '')
|
||||
qcontent = self.trim_field(qcontent, max_length=100)
|
||||
prompt = self.qtitle_prefix + qtitle + self.qcontent_prefix + qcontent + self.answer_prefix
|
||||
prompt = self.process_sample(prompt, tokenizer, tokenize)
|
||||
if 'best_answer' in data:
|
||||
text = data['best_answer']['content']
|
||||
if len(text) > 10:
|
||||
text = self.process_sample(text, tokenizer, tokenize)
|
||||
prompts.append(prompt)
|
||||
texts.append(text)
|
||||
for answer in data.get('other_answers', []):
|
||||
text = answer['content']
|
||||
if len(text) > 100:
|
||||
text = self.process_sample(text, tokenizer, tokenize)
|
||||
prompts.append(prompt)
|
||||
texts.append(text)
|
||||
return prompts, texts
|
||||
|
||||
|
||||
class baike(PromptReader):
|
||||
PATH = '/dataset/fd5061f6/data/tokenize_data/baike.lazy'
|
||||
reserve_punct = True
|
||||
assert_str = 'make sure to set PATH for baike data_utils/corpora.py'
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
prompts, texts = [], []
|
||||
text = data.get('title', '') + data.get('abstract', '') + data.get(
|
||||
'content', '')
|
||||
if text:
|
||||
p, t = self.process_sample('', tokenizer,
|
||||
tokenize), self.process_sample(
|
||||
text, tokenizer, tokenize)
|
||||
prompts.append(p)
|
||||
texts.append(t)
|
||||
return prompts, texts
|
||||
|
||||
|
||||
class wikipedia(PromptReader):
|
||||
"""
|
||||
dataset for wikipedia with arguments configured for convenience
|
||||
|
||||
command line usage: `--train-data wikipedia`
|
||||
"""
|
||||
# PATH = '/dataset/data/wiki.txt'
|
||||
PATH = '/root/data/bert_data/wiki.txt'
|
||||
assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py'
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
text = data['text']
|
||||
prompt, text = self.process_sample('', tokenizer,
|
||||
tokenize), self.process_sample(
|
||||
text, tokenizer, tokenize)
|
||||
return [prompt], [text]
|
||||
|
||||
|
||||
class TestDataset(PromptReader):
|
||||
PATH = '/root/data/test.json'
|
||||
assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py'
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
prompt, text = data['prompt'], data['text']
|
||||
prompt, text = self.process_sample(prompt, tokenizer,
|
||||
tokenize), self.process_sample(
|
||||
text, tokenizer, tokenize)
|
||||
return [prompt], [text]
|
||||
|
||||
|
||||
class OpenWebText(PromptReader):
|
||||
PATH = '/dataset/fd5061f6/english_data/openwebtext2'
|
||||
assert_str = 'make sure to set PATH for openwebtext data_utils/corpora.py'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
import fasttext
|
||||
super().__init__(*args, **kwargs)
|
||||
self.model = fasttext.load_model(
|
||||
'/dataset/fd5061f6/english_data/lid.176.bin')
|
||||
print_rank_0('Load language detection model')
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
text = data['text']
|
||||
if len(text) > 100:
|
||||
lang = self.model.predict(text.replace('\n', ''))[0][0]
|
||||
if lang == '__label__en':
|
||||
prompt, text = self.process_sample(
|
||||
'', tokenizer,
|
||||
tokenize), self.process_sample(text, tokenizer, tokenize)
|
||||
return [prompt], [text]
|
||||
return [], []
|
||||
|
||||
|
||||
class CCNews(PromptReader):
|
||||
PATH = '/mnt/cc_news.json'
|
||||
assert_str = 'make sure to set PATH for cc-news data_utils/corpora.py'
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
text = ''
|
||||
title = data.get('title', None)
|
||||
description = data.get('description', None)
|
||||
maintext = data.get('maintext', None)
|
||||
if title:
|
||||
text += title.strip() + ' '
|
||||
if description and (not maintext
|
||||
or not maintext.startswith(description)):
|
||||
text += description.strip() + ' '
|
||||
if maintext:
|
||||
text += maintext
|
||||
if len(text) > 100:
|
||||
prompt, text = self.process_sample('', tokenizer,
|
||||
tokenize), self.process_sample(
|
||||
text, tokenizer, tokenize)
|
||||
return [prompt], [text]
|
||||
else:
|
||||
return [], []
|
||||
|
||||
|
||||
class BertData(PromptReader):
|
||||
is_json = False
|
||||
PATH = '/dataset/fd5061f6/english_data/wikibook'
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
if data:
|
||||
prompt, text = '', data
|
||||
prompt, text = self.process_sample(prompt, tokenizer,
|
||||
tokenize), self.process_sample(
|
||||
text, tokenizer, tokenize)
|
||||
return [prompt], [text]
|
||||
else:
|
||||
return [], []
|
||||
|
||||
|
||||
class Pile(PromptReader):
|
||||
is_json = True
|
||||
PATH = '/mnt/train'
|
||||
filtered_sources = [
|
||||
'Github', 'StackExchange', 'DM Mathematics', 'Ubuntu IRC', 'EuroParl',
|
||||
'YoutubeSubtitles', 'Enron Emails'
|
||||
]
|
||||
downsample_sources = {'PubMed Central': 0.3, 'ArXiv': 0.3, 'FreeLaw': 0.3}
|
||||
|
||||
def print_info(self, info):
|
||||
total_dict = defaultdict(int)
|
||||
while True:
|
||||
try:
|
||||
source_dict = info.get(block=False)
|
||||
for source, length in source_dict.items():
|
||||
total_dict[source] += length
|
||||
except Empty:
|
||||
break
|
||||
print_rank_0(total_dict)
|
||||
|
||||
def tokenize_worker(self, input, output, info, tokenizer, tokenize):
|
||||
source_dict = defaultdict(int)
|
||||
for row in iter(input.get, 'STOP'):
|
||||
row = row.rstrip()
|
||||
if row:
|
||||
if self.is_json:
|
||||
row = json.loads(row)
|
||||
prompts, texts, source = self.process_line(
|
||||
row, tokenizer, tokenize)
|
||||
length = 0
|
||||
for prompt, text in zip(prompts, texts):
|
||||
length += len(text)
|
||||
output.put((prompt, text))
|
||||
if source:
|
||||
source_dict[source] += length
|
||||
output.put('COMPLETE')
|
||||
info.put(source_dict)
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
source = data['meta'].get('pile_set_name', None)
|
||||
text = data.get('text', None)
|
||||
if source and text:
|
||||
if source in self.filtered_sources:
|
||||
return [], [], None
|
||||
elif source in self.downsample_sources and random.random(
|
||||
) > self.downsample_sources[source]:
|
||||
return [], [], None
|
||||
else:
|
||||
prompt, text = self.process_sample(
|
||||
'', tokenizer,
|
||||
tokenize), self.process_sample(text, tokenizer, tokenize)
|
||||
return [prompt], [text], source
|
||||
else:
|
||||
return [], [], None
|
||||
|
||||
|
||||
class Stories(PromptReader):
|
||||
is_json = True
|
||||
PATH = '/dataset/fd5061f6/english_data/stories_31G.jsonl'
|
||||
|
||||
def process_line(self, data, tokenizer, tokenize):
|
||||
text = data.get('text', None)
|
||||
if text:
|
||||
prompt, text = self.process_sample('', tokenizer,
|
||||
tokenize), self.process_sample(
|
||||
text, tokenizer, tokenize)
|
||||
return [prompt], [text]
|
||||
else:
|
||||
return [], []
|
||||
|
||||
|
||||
class BertBaseData(BertData):
|
||||
PATH = '/root/data/formatted_one_article_per_line'
|
||||
|
||||
|
||||
class BertLargeData(BertData):
|
||||
PATH = '/dataset/c07bd62b/cognitive/zhengxiao/formatted_one_article_per_line_large'
|
||||
|
||||
|
||||
class WuDaoCorpus(PromptReader):
|
||||
# PATH = "/dataset/fd5061f6/chinese_data/WuDao"
|
||||
PATH = '/wudao'
|
||||
is_json = False
|
||||
reserve_punct = True
|
||||
split_row = False
|
||||
|
||||
def process_line(self, item, tokenizer, tokenize):
|
||||
prompts, texts = [], []
|
||||
text = ''
|
||||
title = item.get('title', None)
|
||||
content = item.get('content', None)
|
||||
if title:
|
||||
text += title.strip() + ' '
|
||||
if content:
|
||||
text += content
|
||||
if len(text) > 100:
|
||||
prompt, text = self.process_sample('', tokenizer,
|
||||
tokenize), self.process_sample(
|
||||
text, tokenizer, tokenize)
|
||||
prompts.append(prompt)
|
||||
texts.append(text)
|
||||
return prompts, texts
|
||||
|
||||
|
||||
NAMED_CORPORA = {
|
||||
'wikipedia': wikipedia,
|
||||
'wikipedia-key': KeyReader,
|
||||
'openwebtext': OpenWebText,
|
||||
'zhihu': zhihu,
|
||||
'zhidao': zhidao,
|
||||
'baike': baike,
|
||||
'test': TestDataset,
|
||||
'wikibook': BertData,
|
||||
'bert-base': BertBaseData,
|
||||
'bert-large': BertLargeData,
|
||||
'cc-news': CCNews,
|
||||
'pile': Pile,
|
||||
'stories': Stories,
|
||||
'wudao': WuDaoCorpus
|
||||
}
|
||||
1244
modelscope/models/nlp/mglm/data_utils/datasets.py
Normal file
File diff suppressed because it is too large
71
modelscope/models/nlp/mglm/data_utils/extraction.py
Normal file
@@ -0,0 +1,71 @@
# Copyright (c) 2022 Zhipu.AI

import glob
import os

import json
import nltk

nltk.download('punkt')


class NLTKSegmenter:

    def __init(self):
        pass

    @staticmethod
    def segment_string(article):
        return nltk.tokenize.sent_tokenize(article)


wiki_path = 'data/extracted'
output_path = 'formatted/wiki-key.txt'
segmenter = NLTKSegmenter()
with open(output_path, 'w') as output:
    for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False):
        for filename in glob.glob(
                os.path.join(dirname, 'wiki_*'), recursive=True):
            print(filename)
            article_lines = []
            article_open = False
            with open(filename, mode='r', newline='\n') as file:
                for line in file:
                    line = line.rstrip()
                    if '<doc id=' in line:
                        article_open = True
                    elif '</doc>' in line:
                        key_sentences, contents = [], []
                        key, content = None, []
                        for sentences in article_lines[1:]:
                            if len(sentences) > 1:
                                if key:
                                    if len(content) > 0 or len(contents) == 0:
                                        key_sentences.append(key)
                                        contents.append(content)
                                    else:
                                        contents[-1].append(key)
                                    key, content = None, []
                                key_sentences.append(sentences[0])
                                contents.append(sentences[1:])
                            elif len(sentences) > 0:
                                if key:
                                    content.append(sentences[0])
                                else:
                                    key = sentences[0]
                        if key:
                            if len(content) > 0 or len(contents) == 0:
                                key_sentences.append(key)
                                contents.append(content)
                            else:
                                contents[-1].append(key)
                        contents = [' '.join(content) for content in contents]
                        article = {'key': key_sentences, 'content': contents}
                        output.write(json.dumps(article))
                        output.write('\n')
                        article_open = False
                        article_lines = []
                    else:
                        if article_open and line:
                            sentences = segmenter.segment_string(line)
                            article_lines.append(sentences)
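For illustration only (not part of the commit): the segmenter above simply wraps NLTK sentence splitting, so once the punkt model has been downloaded it can be exercised directly. The sample sentence is invented.

seg = NLTKSegmenter()
print(seg.segment_string('GLM masks spans. It then predicts them autoregressively.'))
# -> ['GLM masks spans.', 'It then predicts them autoregressively.']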
256
modelscope/models/nlp/mglm/data_utils/file_utils.py
Executable file
@@ -0,0 +1,256 @@
|
||||
# Modified by Zhipu.AI
|
||||
# This file is provided as is from:
|
||||
# https://github.com/huggingface/pytorch-pretrained-BERT
|
||||
# Please refer to their repository for copyright.
|
||||
"""
|
||||
Utilities for working with the local dataset cache.
|
||||
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
|
||||
Copyright by the AllenNLP authors.
|
||||
"""
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from functools import wraps
|
||||
from hashlib import sha256
|
||||
from io import open
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import boto3
|
||||
import json
|
||||
import requests
|
||||
from botocore.exceptions import ClientError
|
||||
from tqdm import tqdm
|
||||
|
||||
try:
|
||||
from pathlib import Path
|
||||
PYTORCH_PRETRAINED_BERT_CACHE = Path(
|
||||
os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
|
||||
Path.home() / '.pytorch_pretrained_bert'))
|
||||
except (AttributeError, ImportError):
|
||||
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
|
||||
'PYTORCH_PRETRAINED_BERT_CACHE',
|
||||
os.path.join(os.path.expanduser('~'), '.pytorch_pretrained_bert'))
|
||||
|
||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
    filename = url_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode('utf-8')
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()

    return filename
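For illustration only (not part of the commit): a URL is cached under a sha256-derived name, with a hash of the ETag appended when one is available, so cache entries are naturally invalidated when the remote file changes. The URL and ETag below are example values, and the import path assumes the module layout introduced in this commit.

from modelscope.models.nlp.mglm.data_utils.file_utils import url_to_filename

print(url_to_filename('https://example.com/vocab.txt'))
print(url_to_filename('https://example.com/vocab.txt', etag='"abc123"'))
# the second name carries an extra ".<sha256-of-etag>" suffix;
# cached_path() would then download into PYTORCH_PRETRAINED_BERT_CACHE under that name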
|
||||
|
||||
|
||||
def filename_to_url(filename, cache_dir=None):
|
||||
"""
|
||||
Return the url and etag (which may be ``None``) stored for `filename`.
|
||||
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
cache_path = os.path.join(cache_dir, filename)
|
||||
if not os.path.exists(cache_path):
|
||||
raise EnvironmentError('file {} not found'.format(cache_path))
|
||||
|
||||
meta_path = cache_path + '.json'
|
||||
if not os.path.exists(meta_path):
|
||||
raise EnvironmentError('file {} not found'.format(meta_path))
|
||||
|
||||
with open(meta_path, encoding='utf-8') as meta_file:
|
||||
metadata = json.load(meta_file)
|
||||
url = metadata['url']
|
||||
etag = metadata['etag']
|
||||
|
||||
return url, etag
|
||||
|
||||
|
||||
def cached_path(url_or_filename, cache_dir=None):
|
||||
"""
|
||||
Given something that might be a URL (or might be a local path),
|
||||
determine which. If it's a URL, download the file and cache it, and
|
||||
return the path to the cached file. If it's already a local path,
|
||||
make sure the file exists and then return the path.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
|
||||
url_or_filename = str(url_or_filename)
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
parsed = urlparse(url_or_filename)
|
||||
|
||||
if parsed.scheme in ('http', 'https', 's3'):
|
||||
# URL, so get it from the cache (downloading if necessary)
|
||||
return get_from_cache(url_or_filename, cache_dir)
|
||||
elif os.path.exists(url_or_filename):
|
||||
# File, and it exists.
|
||||
return url_or_filename
|
||||
elif parsed.scheme == '':
|
||||
# File, but it doesn't exist.
|
||||
raise EnvironmentError('file {} not found'.format(url_or_filename))
|
||||
else:
|
||||
# Something unknown
|
||||
raise ValueError(
|
||||
'unable to parse {} as a URL or as a local path'.format(
|
||||
url_or_filename))
|
||||
|
||||
|
||||
def split_s3_path(url):
|
||||
"""Split a full s3 path into the bucket name and path."""
|
||||
parsed = urlparse(url)
|
||||
if not parsed.netloc or not parsed.path:
|
||||
raise ValueError('bad s3 path {}'.format(url))
|
||||
bucket_name = parsed.netloc
|
||||
s3_path = parsed.path
|
||||
# Remove '/' at beginning of path.
|
||||
if s3_path.startswith('/'):
|
||||
s3_path = s3_path[1:]
|
||||
return bucket_name, s3_path
|
||||
|
||||
|
||||
def s3_request(func):
|
||||
"""
|
||||
Wrapper function for s3 requests in order to create more helpful error
|
||||
messages.
|
||||
"""
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(url, *args, **kwargs):
|
||||
try:
|
||||
return func(url, *args, **kwargs)
|
||||
except ClientError as exc:
|
||||
if int(exc.response['Error']['Code']) == 404:
|
||||
raise EnvironmentError('file {} not found'.format(url))
|
||||
else:
|
||||
raise
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
@s3_request
|
||||
def s3_etag(url):
|
||||
"""Check ETag on S3 object."""
|
||||
s3_resource = boto3.resource('s3')
|
||||
bucket_name, s3_path = split_s3_path(url)
|
||||
s3_object = s3_resource.Object(bucket_name, s3_path)
|
||||
return s3_object.e_tag
|
||||
|
||||
|
||||
@s3_request
|
||||
def s3_get(url, temp_file):
|
||||
"""Pull a file directly from S3."""
|
||||
s3_resource = boto3.resource('s3')
|
||||
bucket_name, s3_path = split_s3_path(url)
|
||||
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
||||
|
||||
|
||||
def http_get(url, temp_file):
|
||||
req = requests.get(url, stream=True)
|
||||
content_length = req.headers.get('Content-Length')
|
||||
total = int(content_length) if content_length is not None else None
|
||||
progress = tqdm(unit='B', total=total)
|
||||
for chunk in req.iter_content(chunk_size=1024):
|
||||
if chunk: # filter out keep-alive new chunks
|
||||
progress.update(len(chunk))
|
||||
temp_file.write(chunk)
|
||||
progress.close()
|
||||
|
||||
|
||||
def get_from_cache(url, cache_dir=None):
|
||||
"""
|
||||
Given a URL, look for the corresponding dataset in the local cache.
|
||||
If it's not there, download it. Then return the path to the cached file.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
if not os.path.exists(cache_dir):
|
||||
os.makedirs(cache_dir)
|
||||
|
||||
# Get eTag to add to filename, if it exists.
|
||||
if url.startswith('s3://'):
|
||||
etag = s3_etag(url)
|
||||
else:
|
||||
response = requests.head(url, allow_redirects=True)
|
||||
if response.status_code != 200:
|
||||
raise IOError(
|
||||
'HEAD request failed for url {} with status code {}'.format(
|
||||
url, response.status_code))
|
||||
etag = response.headers.get('ETag')
|
||||
|
||||
filename = url_to_filename(url, etag)
|
||||
|
||||
# get cache path to put the file
|
||||
cache_path = os.path.join(cache_dir, filename)
|
||||
|
||||
if not os.path.exists(cache_path):
|
||||
# Download to temporary file, then copy to cache dir once finished.
|
||||
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
||||
with tempfile.NamedTemporaryFile() as temp_file:
|
||||
logger.info('%s not found in cache, downloading to %s', url,
|
||||
temp_file.name)
|
||||
|
||||
# GET file object
|
||||
if url.startswith('s3://'):
|
||||
s3_get(url, temp_file)
|
||||
else:
|
||||
http_get(url, temp_file)
|
||||
|
||||
# we are copying the file before closing it, so flush to avoid truncation
|
||||
temp_file.flush()
|
||||
# shutil.copyfileobj() starts at the current position, so go to the start
|
||||
temp_file.seek(0)
|
||||
|
||||
logger.info('copying %s to cache at %s', temp_file.name,
|
||||
cache_path)
|
||||
with open(cache_path, 'wb') as cache_file:
|
||||
shutil.copyfileobj(temp_file, cache_file)
|
||||
|
||||
logger.info('creating metadata file for %s', cache_path)
|
||||
meta = {'url': url, 'etag': etag}
|
||||
meta_path = cache_path + '.json'
|
||||
with open(meta_path, 'w', encoding='utf-8') as meta_file:
|
||||
json.dump(meta, meta_file)
|
||||
|
||||
logger.info('removing temp file %s', temp_file.name)
|
||||
|
||||
return cache_path
|
||||
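# A minimal usage sketch for the cache helpers above. The URL and cache
# directory are placeholders, not values shipped with this module;
# get_from_cache() downloads on a cache miss and returns the cached path.
def _example_cached_download():
    url = 'https://example.com/vocab.txt'  # placeholder URL
    local_path = get_from_cache(url, cache_dir='/tmp/bert_cache')
    # A '<local_path>.json' metadata file recording the URL and ETag is
    # written alongside the cached file.
    return local_path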
|
||||
|
||||
def read_set_from_file(filename):
|
||||
'''
|
||||
Extract a de-duped collection (set) of text from a file.
|
||||
Expected file format is one item per line.
|
||||
'''
|
||||
collection = set()
|
||||
with open(filename, 'r', encoding='utf-8') as file_:
|
||||
for line in file_:
|
||||
collection.add(line.rstrip())
|
||||
return collection
|
||||
|
||||
|
||||
def get_file_extension(path, dot=True, lower=True):
|
||||
ext = os.path.splitext(path)[1]
|
||||
ext = ext if dot else ext[1:]
|
||||
return ext.lower() if lower else ext
|
||||
286
modelscope/models/nlp/mglm/data_utils/lazy_loader.py
Normal file
@@ -0,0 +1,286 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""utils for loading text from disk"""
|
||||
import mmap
|
||||
import os
|
||||
import pickle as pkl
|
||||
import time
|
||||
from itertools import accumulate
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.multiprocessing import Lock
|
||||
|
||||
|
||||
def get_lazy_path(path):
|
||||
"""
|
||||
Gets directory path where lazy files are stored.
|
||||
"""
|
||||
return os.path.splitext(path)[0] + '.lazy'
|
||||
|
||||
|
||||
def exists_lazy(path, data_type='data'):
|
||||
"""
|
||||
Check if we've already made a lazy version of this file for the `data_type` field.
|
||||
"""
|
||||
if not os.path.exists(get_lazy_path(path)):
|
||||
return False
|
||||
contents = os.listdir(get_lazy_path(path))
|
||||
if data_type not in contents:
|
||||
return False
|
||||
if data_type + '.len.pkl' not in contents:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def get_scatter_path(path, scatter_rank):
|
||||
path = os.path.splitext(path)[0] + '.scatter'
|
||||
scatter_path = os.path.join(path, str(scatter_rank))
|
||||
return scatter_path
|
||||
|
||||
|
||||
def exists_scatter(path, scatter_num=64, data_type='data'):
|
||||
for i in range(scatter_num):
|
||||
scatter_path = get_scatter_path(path, scatter_rank=i)
|
||||
if not exists_lazy(scatter_path, data_type=data_type):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
class LazyWriter:
|
||||
|
||||
def __init__(self,
|
||||
path,
|
||||
data_type,
|
||||
is_array=False,
|
||||
array_data_type=np.int32):
|
||||
lazypath = get_lazy_path(path)
|
||||
if not os.path.exists(lazypath):
|
||||
os.makedirs(lazypath)
|
||||
self.datapath = os.path.join(lazypath, data_type)
|
||||
self.lenpath = os.path.join(lazypath, data_type + '.len.pkl')
|
||||
self.array_data_type = array_data_type
|
||||
self.output = open(self.datapath, 'wb')
|
||||
self.lengths = []
|
||||
self.is_array = is_array
|
||||
|
||||
@staticmethod
|
||||
def get_len_path(path, data_type):
|
||||
lazypath = get_lazy_path(path)
|
||||
return os.path.join(lazypath, data_type + '.len.pkl')
|
||||
|
||||
def write(self, s):
|
||||
if isinstance(s, dict):
|
||||
s = s['text']
|
||||
if self.is_array:
|
||||
encoded = np.array(
|
||||
s, dtype=self.array_data_type).tobytes(order='C')
|
||||
self.output.write(encoded)
|
||||
self.lengths.append(len(s))
|
||||
else:
|
||||
encoded = s.encode('utf-8')
|
||||
self.output.write(encoded)
|
||||
self.lengths.append(len(encoded))
|
||||
|
||||
def close(self):
|
||||
self.output.close()
|
||||
with open(self.lenpath, 'wb') as f:
|
||||
pkl.dump(self.lengths, f)
|
||||
|
||||
|
||||
def split_strings(strings, start, chr_lens):
|
||||
"""
|
||||
Split strings based on string lengths and given start.
|
||||
"""
|
||||
return [
|
||||
strings[i - start:j - start]
|
||||
for i, j in zip([start] + chr_lens[:-1], chr_lens)
|
||||
]
|
||||
|
||||
|
||||
class ProcessorTokenizer:
|
||||
"""
|
||||
Callable class that runs a preprocessing step, as well as a tokenization step,
|
||||
on input text.
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer, process_fn=None):
|
||||
self.tokenizer = tokenizer
|
||||
self.process_fn = process_fn
|
||||
|
||||
def __call__(self, string):
|
||||
if self.tokenizer is not None:
|
||||
string = self.tokenizer(string, process_fn=self.process_fn)
|
||||
elif self.process_fn is not None:
|
||||
string = self.process_fn(string)
|
||||
return string
|
||||
|
||||
|
||||
class LazyLoader(object):
|
||||
"""
|
||||
Arguments:
|
||||
path: path to directory where array entries are concatenated into one big string file
|
||||
and the .len file are located
|
||||
data_type (str): Some datasets have multiple fields that are stored in different paths.
|
||||
`data_type` specifies which of these fields to load in this class
|
||||
mem_map (boolean): Specifies whether to memory map file `path`
|
||||
map_fn (callable): Fetched strings are passed through map_fn before being returned.
|
||||
|
||||
Example of lazy loader directory structure:
|
||||
file.json
|
||||
file.lazy/
|
||||
data_type1
|
||||
data_type1.len.pkl
|
||||
data_type2
|
||||
data_type2.len.pkl
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
path,
|
||||
data_type='data',
|
||||
mem_map=False,
|
||||
map_fn=None,
|
||||
is_array=False,
|
||||
array_data_type=np.int32,
|
||||
load_memory=False,
|
||||
half_load=False):
|
||||
lazypath = get_lazy_path(path)
|
||||
datapath = os.path.join(lazypath, data_type)
|
||||
# get file where array entries are concatenated into one big string
|
||||
self._file = open(datapath, 'rb')
|
||||
self.file = self._file
|
||||
self.is_array = is_array
|
||||
self.array_data_type = array_data_type
|
||||
# memory map file if necessary
|
||||
lenpath = os.path.join(lazypath, data_type + '.len.pkl')
|
||||
self.lens = pkl.load(open(lenpath, 'rb'))
|
||||
if half_load:
|
||||
self.lens = self.lens[:2 * len(self.lens) // 3]
|
||||
self.ends = list(accumulate(self.lens))
|
||||
self.dumb_ends = list(self.ends)
|
||||
self.mem_map = mem_map
|
||||
self.load_memory = load_memory
|
||||
if self.load_memory:
|
||||
data_type_size = np.dtype(self.array_data_type).itemsize
|
||||
if half_load:
|
||||
self.file = self.file.read(sum(self.lens) * data_type_size)
|
||||
else:
|
||||
self.file = self.file.read()
|
||||
self.file = np.ndarray(
|
||||
shape=(len(self.file) // data_type_size, ),
|
||||
dtype=array_data_type,
|
||||
buffer=self.file,
|
||||
order='C')
|
||||
elif self.mem_map:
|
||||
if is_array:
|
||||
if self.ends[-1] == 0:
|
||||
self.file = np.array([], dtype=array_data_type)
|
||||
else:
|
||||
self.file = np.memmap(
|
||||
self.file, dtype=array_data_type, mode='r', order='C')
|
||||
else:
|
||||
if self.ends[-1] == 0:
|
||||
self.file = bytearray()
|
||||
else:
|
||||
self.file = mmap.mmap(
|
||||
self.file.fileno(), 0, prot=mmap.PROT_READ)
|
||||
self.read_lock = Lock()
|
||||
self.process_fn = map_fn
|
||||
self.map_fn = map_fn
|
||||
self._tokenizer = None
|
||||
self.is_lazy = True
|
||||
|
||||
def SetTokenizer(self, tokenizer):
|
||||
"""
|
||||
Logic to set and remove (set to None) the tokenizer.
Combines preprocessing/tokenization into one callable.
|
||||
"""
|
||||
if tokenizer is None:
|
||||
if not hasattr(self, '_tokenizer'):
|
||||
self._tokenizer = tokenizer
|
||||
else:
|
||||
self._tokenizer = tokenizer
|
||||
self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)
|
||||
|
||||
def GetTokenizer(self):
|
||||
return self._tokenizer
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""
|
||||
read file and splice strings based on string ending array `self.ends`
|
||||
"""
|
||||
if not isinstance(index, slice):
|
||||
if index == 0:
|
||||
start = 0
|
||||
else:
|
||||
start = self.ends[index - 1]
|
||||
end = self.ends[index]
|
||||
rtn = self.file_read(start, end)
|
||||
if self.map_fn is not None:
|
||||
rtn = self.map_fn(rtn)
|
||||
else:
|
||||
# if slice, fetch strings with 1 diskread and then splice in memory
|
||||
chr_lens = self.ends[index]
|
||||
if index.start == 0 or index.start is None:
|
||||
start = 0
|
||||
else:
|
||||
start = self.ends[index.start - 1]
|
||||
stop = chr_lens[-1]
|
||||
strings = self.file_read(start, stop)
|
||||
rtn = split_strings(strings, start, chr_lens)
|
||||
if self.map_fn is not None:
|
||||
rtn = [self.map_fn(s) for s in rtn]
|
||||
return rtn
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ends)
|
||||
|
||||
def file_read(self, start=0, end=None):
|
||||
"""read specified portion of file"""
|
||||
data_type_size = np.dtype(self.array_data_type).itemsize
|
||||
# atomic reads to avoid race conditions with multiprocess dataloader
|
||||
self.read_lock.acquire()
|
||||
if not self.mem_map and not self.load_memory:
|
||||
# seek to start of file read
|
||||
if self.is_array:
|
||||
start = start * data_type_size
|
||||
end = end * data_type_size if end is not None else None
|
||||
self.file.seek(start)
|
||||
# read to end of file if no end point provided
|
||||
if end is None:
|
||||
rtn = self.file.read()
|
||||
# else read amount needed to reach end point
|
||||
else:
|
||||
rtn = self.file.read(end - start)
|
||||
if self.is_array:
|
||||
rtn = np.ndarray(
|
||||
shape=(len(rtn) // data_type_size, ),
|
||||
dtype=self.array_data_type,
|
||||
buffer=rtn,
|
||||
order='C')
|
||||
else:
|
||||
rtn = rtn.decode('utf-8', 'ignore')
|
||||
else:
|
||||
rtn = self.file[start:end]
|
||||
if self.is_array:
|
||||
rtn = rtn.copy()
|
||||
else:
|
||||
rtn = rtn.decode('utf-8', 'strict')
|
||||
self.read_lock.release()
|
||||
# TODO: @raulp figure out mem map byte string bug
|
||||
# if mem map'd need to decode byte string to string
|
||||
# # rtn = str(rtn)
|
||||
# if self.mem_map:
|
||||
# rtn = rtn.decode('unicode_escape')
|
||||
return rtn
|
||||
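# A small round-trip sketch for LazyWriter/LazyLoader, assuming a placeholder
# 'corpus.json' path (only used to derive the 'corpus.lazy/' directory) and the
# default 'data' field layout described in the LazyLoader docstring.
def _example_lazy_round_trip():
    path = 'corpus.json'  # placeholder; 'corpus.lazy/' is created next to it
    writer = LazyWriter(path, data_type='data')
    writer.write('first document')
    writer.write('second document')
    writer.close()

    loader = LazyLoader(path, data_type='data', mem_map=False)
    assert len(loader) == 2
    return loader[0], loader[1]  # -> ('first document', 'second document')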
190
modelscope/models/nlp/mglm/data_utils/samplers.py
Normal file
@@ -0,0 +1,190 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""batch samplers that work with either random or sequential data samplers"""
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils import data
|
||||
|
||||
|
||||
class RandomSampler(data.sampler.Sampler):
|
||||
r"""
|
||||
Based on PyTorch's RandomSampler and DistributedSampler. Essentially a RandomSampler,
but this class lets the user set an epoch like DistributedSampler.
|
||||
Samples elements randomly. If without replacement, then sample from a shuffled dataset.
|
||||
If with replacement, then user can specify ``num_samples`` to draw.
|
||||
Arguments:
|
||||
data_source (Dataset): dataset to sample from
|
||||
num_samples (int): number of samples to draw, default=len(dataset)
|
||||
replacement (bool): samples are drawn with replacement if ``True``, default=False
|
||||
"""
|
||||
|
||||
def __init__(self, data_source, replacement=False, num_samples=None):
|
||||
super(RandomSampler, self).__init__(data_source)
|
||||
self.data_source = data_source
|
||||
self.replacement = replacement
|
||||
self._num_samples = num_samples
|
||||
self.epoch = -1
|
||||
|
||||
if self._num_samples is not None and replacement is False:
|
||||
raise ValueError(
|
||||
'With replacement=False, num_samples should not be specified, '
|
||||
'since a random permute will be performed.')
|
||||
|
||||
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
|
||||
raise ValueError('num_samples should be a positive integer '
|
||||
'value, but got num_samples={}'.format(
|
||||
self.num_samples))
|
||||
if not isinstance(self.replacement, bool):
|
||||
raise ValueError('replacement should be a boolean value, but got '
|
||||
'replacement={}'.format(self.replacement))
|
||||
|
||||
@property
|
||||
def num_samples(self):
|
||||
# dataset size might change at runtime
|
||||
if self._num_samples is None:
|
||||
return len(self.data_source)
|
||||
return self._num_samples
|
||||
|
||||
def __iter__(self):
|
||||
n = len(self.data_source)
|
||||
g = torch.Generator()
|
||||
if self.epoch >= 0:
|
||||
g.manual_seed(self.epoch)
|
||||
if self.replacement:
|
||||
for _ in range(self.num_samples // 32):
|
||||
yield from torch.randint(
|
||||
high=n, size=(32, ), dtype=torch.int64,
|
||||
generator=g).tolist()
|
||||
yield from torch.randint(
|
||||
high=n,
|
||||
size=(self.num_samples % 32, ),
|
||||
dtype=torch.int64,
|
||||
generator=g).tolist()
|
||||
else:
|
||||
yield from torch.randperm(n, generator=g).tolist()
|
||||
|
||||
def __len__(self):
|
||||
return self.num_samples
|
||||
|
||||
def set_epoch(self, epoch):
|
||||
self.epoch = epoch
|
||||
|
||||
|
||||
class DistributedSequentialSampler(data.sampler.Sampler):
|
||||
|
||||
def __init__(self,
|
||||
num_samples,
|
||||
train_iters,
|
||||
batch_size,
|
||||
rank=-1,
|
||||
world_size=2):
|
||||
super().__init__(num_samples)
|
||||
if rank == -1:
|
||||
rank = 0
|
||||
world_size = 1
|
||||
self.num_samples = num_samples
|
||||
self.rank = rank
|
||||
self.world_size = world_size
|
||||
self.start_iter = 0
|
||||
self.train_iters = train_iters
|
||||
self.batch_size = batch_size
|
||||
self.batch_bias = [
|
||||
i * (num_samples // batch_size) for i in range(batch_size)
|
||||
]
|
||||
|
||||
def __iter__(self):
|
||||
for idx in range(self.start_iter, self.train_iters * 10):
|
||||
batch = [(idx + bias) % self.num_samples
|
||||
for bias in self.batch_bias]
|
||||
tbatch = self._batch(batch)
|
||||
yield tbatch
|
||||
|
||||
def __len__(self):
|
||||
return self.train_iters
|
||||
|
||||
def _batch(self, batch):
|
||||
"""extracts samples only pertaining to this worker's batch"""
|
||||
start = self.rank * self.batch_size // self.world_size
|
||||
end = (self.rank + 1) * self.batch_size // self.world_size
|
||||
return batch[start:end]
|
||||
|
||||
|
||||
class DistributedBatchSampler(data.sampler.BatchSampler):
|
||||
"""
|
||||
Similar to the normal implementation of a distributed sampler, except the implementation is at the
batch sampler level instead of just the sampler level. This allows wrapping of arbitrary
data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
sampler,
|
||||
batch_size,
|
||||
drop_last,
|
||||
rank=-1,
|
||||
world_size=2,
|
||||
wrap_last=False,
|
||||
gradient_accumulation_steps=None):
|
||||
super(DistributedBatchSampler, self).__init__(sampler, batch_size,
|
||||
drop_last)
|
||||
if rank == -1:
|
||||
assert False, 'should not be here'
|
||||
self.rank = rank
|
||||
self.world_size = world_size
|
||||
self.sampler.wrap_around = 0
|
||||
self.wrap_around = 0
|
||||
self.wrap_last = wrap_last
|
||||
self.start_iter = 0
|
||||
self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps # noqa
|
||||
|
||||
def __iter__(self):
|
||||
batch = []
|
||||
i = 0
|
||||
for idx in self.data_iterator(self.sampler, wrap_around=False):
|
||||
batch.append(idx)
|
||||
if len(batch) == self.batch_size:
|
||||
tbatch = self._batch(batch)
|
||||
if i >= self.start_iter * self.effective_batch_size:
|
||||
yield tbatch
|
||||
self.start_iter = 0
|
||||
i += len(batch)
|
||||
batch = []
|
||||
batch_len = len(batch)
|
||||
if batch_len > 0 and not self.drop_last:
|
||||
if self.wrap_last:
|
||||
self.sampler.wrap_around -= (self.batch_size)
|
||||
self.wrap_around += (len(batch))
|
||||
self.wrap_around %= self.batch_size
|
||||
yield self._batch(batch)
|
||||
if self.wrap_last:
|
||||
self.sampler.wrap_around += self.batch_size
|
||||
|
||||
def data_iterator(self, _iter, wrap_around=False):
|
||||
"""iterates through data and handles wrap around"""
|
||||
for i, idx in enumerate(_iter):
|
||||
if i < self.wrap_around % self.batch_size:
|
||||
continue
|
||||
if wrap_around:
|
||||
self.wrap_around += 1
|
||||
self.wrap_around %= self.batch_size
|
||||
yield idx
|
||||
|
||||
def _batch(self, batch):
|
||||
"""extracts samples only pertaining to this worker's batch"""
|
||||
start = self.rank * self.batch_size // self.world_size
|
||||
end = (self.rank + 1) * self.batch_size // self.world_size
|
||||
return batch[start:end]
|
||||
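# A usage sketch combining the samplers above with a PyTorch DataLoader,
# assuming a single process (rank 0 of world_size 1); the dataset is a
# stand-in list of integers.
def _example_distributed_batches():
    dataset = list(range(10))
    sampler = RandomSampler(dataset)
    sampler.set_epoch(0)  # seeds the shuffle so every rank draws the same permutation
    batch_sampler = DistributedBatchSampler(
        sampler, batch_size=4, drop_last=False, rank=0, world_size=1)
    loader = data.DataLoader(dataset, batch_sampler=batch_sampler)
    return [batch.tolist() for batch in loader]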
158
modelscope/models/nlp/mglm/data_utils/sp_tokenizer.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# Modified by Zhipu.AI
|
||||
"""
|
||||
Adapted from https://github.com/openai/gpt-2/ and modified for Chinese.
|
||||
"""
|
||||
import os # yapf: disable
|
||||
|
||||
|
||||
"""
|
||||
SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation
|
||||
systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements
|
||||
subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the
|
||||
extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end
|
||||
system that does not depend on language-specific pre/postprocessing.
|
||||
https://github.com/google/sentencepiece
|
||||
|
||||
pip install sentencepiece
|
||||
|
||||
or git clone https://github.com/google/sentencepiece.git
|
||||
python setup.py install
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def get_pairs(word):
|
||||
pairs = set()
|
||||
prev_char = word[0]
|
||||
for char in word[1:]:
|
||||
pairs.add((prev_char, char))
|
||||
prev_char = char
|
||||
return pairs
|
||||
|
||||
|
||||
class Encoder:
|
||||
|
||||
def __init__(self, encoder, bpe_merges):
|
||||
self.encoder = encoder
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
self.cache = {}
|
||||
self.max_len = 0
|
||||
|
||||
def bpe(self, token):
|
||||
if token in self.cache:
|
||||
return self.cache[token]
|
||||
word = tuple(token)
|
||||
pairs = get_pairs(word)
|
||||
if not pairs:
|
||||
return token
|
||||
|
||||
while True:
|
||||
bigram = min(
|
||||
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
|
||||
if bigram not in self.bpe_ranks:
|
||||
break
|
||||
first, second = bigram
|
||||
new_word = []
|
||||
i = 0
|
||||
while i < len(word):
|
||||
try:
|
||||
j = word.index(first, i)
|
||||
new_word.extend(word[i:j])
|
||||
i = j
|
||||
except: # noqa
|
||||
new_word.extend(word[i:])
|
||||
break
|
||||
|
||||
if word[i] == first and i < len(word) - 1 and word[
|
||||
i + 1] == second:
|
||||
new_word.append(first + second)
|
||||
i += 2
|
||||
else:
|
||||
new_word.append(word[i])
|
||||
i += 1
|
||||
new_word = tuple(new_word)
|
||||
word = new_word
|
||||
if len(word) == 1:
|
||||
break
|
||||
else:
|
||||
pairs = get_pairs(word)
|
||||
word = ' '.join(word)
|
||||
self.cache[token] = word
|
||||
return word
|
||||
|
||||
def encode(self, text):
|
||||
return [self.encoder.get(token, 1) for token in self.tokenize(text)]
|
||||
|
||||
def decode(self, tokens):
|
||||
text = ''.join([self.decoder[token] for token in tokens])
|
||||
return text
|
||||
|
||||
def tokenize(self, text):
|
||||
bpe_tokens = []
|
||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' '))
|
||||
return bpe_tokens
|
||||
|
||||
def convert_tokens_to_ids(self, tokens):
|
||||
return [self.encoder.get(token, 1) for token in tokens]
|
||||
|
||||
|
||||
class Encoder_SP:
|
||||
|
||||
def __init__(self, model_path):
|
||||
import sentencepiece as spm
|
||||
self.sp = spm.SentencePieceProcessor()
|
||||
self.sp.Load(model_path)
|
||||
|
||||
def encode(self, text):
|
||||
"""
|
||||
text="...."
|
||||
"""
|
||||
return self.sp.EncodeAsIds(text)
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
tokens=[x1,x2,...]
|
||||
"""
|
||||
text = [int(token) for token in tokens]
|
||||
# print(text)
|
||||
return self.sp.DecodeIds(text)
|
||||
|
||||
def tokenize(self, text):
|
||||
return self.sp.EncodeAsPieces(text)
|
||||
|
||||
def convert_tokens_to_ids(self, tokens):
|
||||
return [self.sp.PieceToId(token) for token in tokens]
|
||||
|
||||
def convert_token_to_id(self, token):
|
||||
return self.sp.PieceToId(token)
|
||||
|
||||
def convert_id_to_token(self, idx):
|
||||
return self.sp.IdToPiece(idx)
|
||||
|
||||
|
||||
def get_encoder(encoder_file, bpe_file):
|
||||
import json
|
||||
filepath, filename = os.path.split(encoder_file)
|
||||
shotname, extension = os.path.splitext(filename)
|
||||
|
||||
if ('.model' == extension) and (bpe_file == ''):
|
||||
return Encoder_SP(encoder_file)
|
||||
else:
|
||||
with open(encoder_file, 'r', encoding='utf-8') as f:
|
||||
encoder = json.load(f)
|
||||
with open(bpe_file, 'r', encoding='utf-8') as f:
|
||||
bpe_data = f.read()
|
||||
bpe_merges = [
|
||||
tuple(merge_str.split())
|
||||
for merge_str in bpe_data.split('\n')[1:-1]
|
||||
]
|
||||
return Encoder(
|
||||
encoder=encoder,
|
||||
bpe_merges=bpe_merges,
|
||||
)
|
||||
|
||||
|
||||
def from_pretrained(model_path):
|
||||
return get_encoder(model_path + '/tokenizer/mglm250k/mglm250k-uni.model',
|
||||
'')
|
||||
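# A usage sketch for the SentencePiece-backed encoder, assuming a model
# directory that contains 'tokenizer/mglm250k/mglm250k-uni.model' (the layout
# expected by from_pretrained above).
def _example_sentencepiece_round_trip(model_dir):
    tokenizer = from_pretrained(model_dir)
    ids = tokenizer.encode('今天天气不错')
    pieces = tokenizer.tokenize('今天天气不错')
    text = tokenizer.decode(ids)
    return ids, pieces, text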
1396
modelscope/models/nlp/mglm/data_utils/tokenization.py
Executable file
File diff suppressed because it is too large
359
modelscope/models/nlp/mglm/data_utils/tokenization_gpt2.py
Normal file
@@ -0,0 +1,359 @@
|
||||
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for OpenAI GPT."""
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import json
|
||||
import regex as re
|
||||
|
||||
from .file_utils import cached_path
|
||||
|
||||
try:
|
||||
from functools import lru_cache
|
||||
except ImportError:
|
||||
# Just a dummy decorator to get the checks to run on python2
|
||||
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
|
||||
def lru_cache():
|
||||
return lambda func: func
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_VOCAB_ARCHIVE_MAP = {
|
||||
'gpt2': '.pytorch_pretrained_bert/gpt2-vocab.json',
|
||||
'roberta': '.pytorch_pretrained_bert/roberta-vocab.json'
|
||||
}
|
||||
PRETRAINED_MERGES_ARCHIVE_MAP = {
|
||||
'gpt2': '.pytorch_pretrained_bert/gpt2-merges.txt',
|
||||
'roberta': '.pytorch_pretrained_bert/roberta-merges.txt'
|
||||
}
|
||||
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
|
||||
'gpt2': 1024,
|
||||
}
|
||||
VOCAB_NAME = 'vocab.json'
|
||||
MERGES_NAME = 'merges.txt'
|
||||
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a signficant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
_chr = unichr if sys.version_info[0] == 2 else chr
|
||||
bs = list(range(ord('!'),
|
||||
ord('~') + 1)) + list(range(
|
||||
ord('¡'),
|
||||
ord('¬') + 1)) + list(range(ord('®'),
|
||||
ord('ÿ') + 1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8 + n)
|
||||
n += 1
|
||||
cs = [_chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
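# A quick illustration of the byte-to-unicode table built above: every byte
# value maps to a printable character, and bytes outside the kept ASCII and
# Latin-1 ranges are shifted into the 256+ code points (for example, the space
# byte 0x20 becomes 'Ġ', which is why GPT-2 merge entries look the way they do).
def _example_byte_mapping():
    table = bytes_to_unicode()
    assert len(table) == 256
    assert table[ord('A')] == 'A'   # printable bytes map to themselves
    assert table[ord(' ')] == 'Ġ'   # space is remapped to a visible character
    return table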
|
||||
|
||||
def get_pairs(word):
|
||||
"""Return set of symbol pairs in a word.
|
||||
|
||||
Word is represented as tuple of symbols (symbols being variable-length strings).
|
||||
"""
|
||||
pairs = set()
|
||||
prev_char = word[0]
|
||||
for char in word[1:]:
|
||||
pairs.add((prev_char, char))
|
||||
prev_char = char
|
||||
return pairs
|
||||
|
||||
|
||||
class GPT2Tokenizer(object):
|
||||
"""
|
||||
GPT-2 BPE tokenizer. Peculiarities:
|
||||
- Byte-level BPE
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls,
|
||||
pretrained_model_name_or_path,
|
||||
cache_dir=None,
|
||||
*inputs,
|
||||
**kwargs):
|
||||
"""
|
||||
Instantiate a PreTrainedBertModel from a pre-trained model file.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
"""
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
|
||||
pretrained_model_name_or_path]
|
||||
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[
|
||||
pretrained_model_name_or_path]
|
||||
special_tokens_file = None
|
||||
else:
|
||||
vocab_file = os.path.join(pretrained_model_name_or_path,
|
||||
VOCAB_NAME)
|
||||
merges_file = os.path.join(pretrained_model_name_or_path,
|
||||
MERGES_NAME)
|
||||
special_tokens_file = os.path.join(pretrained_model_name_or_path,
|
||||
SPECIAL_TOKENS_NAME)
|
||||
if not os.path.exists(special_tokens_file):
|
||||
special_tokens_file = None
|
||||
else:
|
||||
logger.info('loading special tokens file {}'.format(
|
||||
special_tokens_file))
|
||||
# redirect to the cache, if necessary
|
||||
# try:
|
||||
# resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||
# resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
|
||||
# except EnvironmentError:
|
||||
# logger.error(
|
||||
# "Model name '{}' was not found in model name list ({}). "
|
||||
# "We assumed '{}' was a path or url but couldn't find files {} and {} "
|
||||
# "at this path or url.".format(
|
||||
# pretrained_model_name_or_path,
|
||||
# ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
|
||||
# pretrained_model_name_or_path,
|
||||
# vocab_file, merges_file))
|
||||
# return None
|
||||
# if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
|
||||
# logger.info("loading vocabulary file {}".format(vocab_file))
|
||||
# logger.info("loading merges file {}".format(merges_file))
|
||||
# else:
|
||||
# logger.info("loading vocabulary file {} from cache at {}".format(
|
||||
# vocab_file, resolved_vocab_file))
|
||||
# logger.info("loading merges file {} from cache at {}".format(
|
||||
# merges_file, resolved_merges_file))
|
||||
resolved_vocab_file = vocab_file
|
||||
resolved_merges_file = merges_file
|
||||
logger.info('loading vocabulary file {}'.format(vocab_file))
|
||||
logger.info('loading merges file {}'.format(merges_file))
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
|
||||
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
|
||||
# than the number of positional embeddings
|
||||
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
|
||||
pretrained_model_name_or_path]
|
||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
||||
# Instantiate tokenizer.
|
||||
if special_tokens_file and 'special_tokens' not in kwargs:
|
||||
special_tokens = open(
|
||||
special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
|
||||
else:
|
||||
special_tokens = kwargs.pop('special_tokens', [])
|
||||
tokenizer = cls(
|
||||
resolved_vocab_file,
|
||||
resolved_merges_file,
|
||||
special_tokens=special_tokens,
|
||||
*inputs,
|
||||
**kwargs)
|
||||
return tokenizer
|
||||
|
||||
def __init__(self,
|
||||
vocab_file,
|
||||
merges_file,
|
||||
errors='replace',
|
||||
special_tokens=None,
|
||||
max_len=None):
|
||||
self.max_len = max_len if max_len is not None else int(1e12)
|
||||
self.encoder = json.load(open(vocab_file))
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.byte_encoder = bytes_to_unicode()
|
||||
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
self.cache = {}
|
||||
|
||||
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||
self.pat = re.compile(
|
||||
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
|
||||
)
|
||||
|
||||
self.special_tokens = {}
|
||||
self.special_tokens_decoder = {}
|
||||
self.set_special_tokens(special_tokens)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.encoder) + len(self.special_tokens)
|
||||
|
||||
def set_special_tokens(self, special_tokens):
|
||||
""" Add a list of additional tokens to the encoder.
|
||||
The additional tokens are indexed starting from the last index of the
|
||||
current vocabulary in the order of the `special_tokens` list.
|
||||
"""
|
||||
if not special_tokens:
|
||||
self.special_tokens = {}
|
||||
self.special_tokens_decoder = {}
|
||||
return
|
||||
self.special_tokens = dict((tok, len(self.encoder) + i)
|
||||
for i, tok in enumerate(special_tokens))
|
||||
self.special_tokens_decoder = {
|
||||
v: k
|
||||
for k, v in self.special_tokens.items()
|
||||
}
|
||||
logger.info('Special tokens {}'.format(self.special_tokens))
|
||||
|
||||
def bpe(self, token):
|
||||
if token in self.cache:
|
||||
return self.cache[token]
|
||||
word = tuple(token)
|
||||
pairs = get_pairs(word)
|
||||
|
||||
if not pairs:
|
||||
return token
|
||||
|
||||
while True:
|
||||
bigram = min(
|
||||
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
|
||||
if bigram not in self.bpe_ranks:
|
||||
break
|
||||
first, second = bigram
|
||||
new_word = []
|
||||
i = 0
|
||||
while i < len(word):
|
||||
try:
|
||||
j = word.index(first, i)
|
||||
new_word.extend(word[i:j])
|
||||
i = j
|
||||
except: # noqa
|
||||
new_word.extend(word[i:])
|
||||
break
|
||||
|
||||
if word[i] == first and i < len(word) - 1 and word[
|
||||
i + 1] == second:
|
||||
new_word.append(first + second)
|
||||
i += 2
|
||||
else:
|
||||
new_word.append(word[i])
|
||||
i += 1
|
||||
new_word = tuple(new_word)
|
||||
word = new_word
|
||||
if len(word) == 1:
|
||||
break
|
||||
else:
|
||||
pairs = get_pairs(word)
|
||||
word = ' '.join(word)
|
||||
self.cache[token] = word
|
||||
return word
|
||||
|
||||
def tokenize(self, text):
|
||||
""" Tokenize a string. """
|
||||
bpe_tokens = []
|
||||
for token in re.findall(self.pat, text):
|
||||
if sys.version_info[0] == 2:
|
||||
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
||||
else:
|
||||
token = ''.join(self.byte_encoder[b]
|
||||
for b in token.encode('utf-8'))
|
||||
bpe_tokens.extend(
|
||||
bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||
return bpe_tokens
|
||||
|
||||
def convert_tokens_to_ids(self, tokens):
|
||||
""" Converts a sequence of tokens into ids using the vocab. """
|
||||
ids = []
|
||||
if isinstance(tokens, str) or (sys.version_info[0] == 2
|
||||
and isinstance(tokens, unicode)):
|
||||
if tokens in self.special_tokens:
|
||||
return self.special_tokens[tokens]
|
||||
else:
|
||||
return self.encoder.get(tokens, 0)
|
||||
for token in tokens:
|
||||
if token in self.special_tokens:
|
||||
ids.append(self.special_tokens[token])
|
||||
else:
|
||||
ids.append(self.encoder.get(token, 0))
|
||||
if len(ids) > self.max_len:
|
||||
logger.warning(
|
||||
'Token indices sequence length is longer than the specified maximum '
|
||||
' sequence length for this OpenAI GPT model ({} > {}). Running this'
|
||||
' sequence through the model will result in indexing errors'.
|
||||
format(len(ids), self.max_len))
|
||||
return ids
|
||||
|
||||
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||
"""Converts a sequence of ids in BPE tokens using the vocab."""
|
||||
tokens = []
|
||||
for i in ids:
|
||||
if i in self.special_tokens_decoder:
|
||||
if not skip_special_tokens:
|
||||
tokens.append(self.special_tokens_decoder[i])
|
||||
else:
|
||||
tokens.append(self.decoder[i])
|
||||
return tokens
|
||||
|
||||
def encode(self, text):
|
||||
return self.convert_tokens_to_ids(self.tokenize(text))
|
||||
|
||||
def decode(self, tokens):
|
||||
text = ''.join([self.decoder[token] for token in tokens])
|
||||
text = bytearray([self.byte_decoder[c] for c in text]).decode(
|
||||
'utf-8', errors=self.errors)
|
||||
return text
|
||||
|
||||
def save_vocabulary(self, vocab_path):
|
||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
||||
if not os.path.isdir(vocab_path):
|
||||
logger.error('Vocabulary path ({}) should be a directory'.format(
|
||||
vocab_path))
|
||||
return
|
||||
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
|
||||
merge_file = os.path.join(vocab_path, MERGES_NAME)
|
||||
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
|
||||
|
||||
with open(vocab_file, 'w', encoding='utf-8') as f:
|
||||
f.write(json.dumps(self.encoder, ensure_ascii=False))
|
||||
|
||||
index = 0
|
||||
with open(merge_file, 'w', encoding='utf-8') as writer:
|
||||
writer.write(u'#version: 0.2\n')
|
||||
for bpe_tokens, token_index in sorted(
|
||||
self.bpe_ranks.items(), key=lambda kv: kv[1]):
|
||||
if index != token_index:
|
||||
logger.warning(
|
||||
'Saving vocabulary to {}: BPE merge indices are not consecutive.'
|
||||
' Please check that the tokenizer is not corrupted!'.
|
||||
format(merge_file))
|
||||
index = token_index
|
||||
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||
index += 1
|
||||
|
||||
index = len(self.encoder)
|
||||
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
|
||||
for token, token_index in sorted(
|
||||
self.special_tokens.items(), key=lambda kv: kv[1]):
|
||||
if index != token_index:
|
||||
logger.warning(
|
||||
'Saving special tokens vocabulary to {}: BPE indices are not consecutive.'
|
||||
' Please check that the tokenizer is not corrupted!'.
|
||||
format(special_tokens_file))
|
||||
index = token_index
|
||||
writer.write(token + u'\n')
|
||||
index += 1
|
||||
|
||||
return vocab_file, merge_file, special_tokens_file
|
||||
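# A usage sketch for GPT2Tokenizer, assuming a local directory that contains
# 'vocab.json' and 'merges.txt' (the VOCAB_NAME / MERGES_NAME files loaded by
# from_pretrained above).
def _example_gpt2_round_trip(vocab_dir):
    tokenizer = GPT2Tokenizer.from_pretrained(vocab_dir)
    ids = tokenizer.encode('Hello world')
    tokens = tokenizer.convert_ids_to_tokens(ids)
    text = tokenizer.decode(ids)  # byte-level BPE decodes back to the input text
    return ids, tokens, text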
408
modelscope/models/nlp/mglm/data_utils/wordpiece.py
Executable file
@@ -0,0 +1,408 @@
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes. Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py""" # noqa
|
||||
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
import collections
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from io import open
|
||||
|
||||
from .file_utils import cached_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_VOCAB_ARCHIVE_MAP = {
|
||||
'bert-base-uncased':
|
||||
'.pytorch_pretrained_bert/bert-base-uncased-vocab.txt',
|
||||
'bert-large-uncased':
|
||||
'.pytorch_pretrained_bert/bert-large-uncased-vocab.txt',
|
||||
'bert-base-cased':
|
||||
'.pytorch_pretrained_bert/bert-base-cased-vocab.txt',
|
||||
'bert-large-cased':
|
||||
'.pytorch_pretrained_bert/bert-large-cased-vocab.txt',
|
||||
'bert-base-multilingual-uncased':
|
||||
'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt',
|
||||
'bert-base-multilingual-cased':
|
||||
'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt',
|
||||
'bert-base-chinese':
|
||||
'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt',
|
||||
}
|
||||
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
|
||||
'bert-base-uncased': 512,
|
||||
'bert-large-uncased': 512,
|
||||
'bert-base-cased': 512,
|
||||
'bert-large-cased': 512,
|
||||
'bert-base-multilingual-uncased': 512,
|
||||
'bert-base-multilingual-cased': 512,
|
||||
'bert-base-chinese': 512,
|
||||
}
|
||||
VOCAB_NAME = 'vocab.txt'
|
||||
|
||||
|
||||
def load_vocab(vocab_file):
|
||||
"""Loads a vocabulary file into a dictionary."""
|
||||
vocab = collections.OrderedDict()
|
||||
index = 0
|
||||
with open(vocab_file, 'r', encoding='utf-8') as reader:
|
||||
while True:
|
||||
token = reader.readline()
|
||||
if not token:
|
||||
break
|
||||
token = token.strip()
|
||||
vocab[token] = index
|
||||
index += 1
|
||||
return vocab
|
||||
|
||||
|
||||
def whitespace_tokenize(text):
|
||||
"""Runs basic whitespace cleaning and splitting on a piece of text."""
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return []
|
||||
tokens = text.split()
|
||||
return tokens
|
||||
|
||||
|
||||
class BertTokenizer(object):
|
||||
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
|
||||
|
||||
def __init__(self,
|
||||
vocab_file,
|
||||
do_lower_case=True,
|
||||
max_len=None,
|
||||
do_basic_tokenize=True,
|
||||
never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
|
||||
"""Constructs a BertTokenizer.
|
||||
|
||||
Args:
|
||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file
|
||||
do_lower_case: Whether to lower case the input
|
||||
Only has an effect when do_wordpiece_only=False
|
||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
|
||||
max_len: An artificial maximum length to truncate tokenized sequences to;
|
||||
Effective maximum length is always the minimum of this
|
||||
value (if specified) and the underlying BERT model's
|
||||
sequence length.
|
||||
never_split: List of tokens which will never be split during tokenization.
|
||||
Only has an effect when do_wordpiece_only=False
|
||||
"""
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
|
||||
'model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
|
||||
.format(vocab_file))
|
||||
self.vocab = load_vocab(vocab_file)
|
||||
self.ids_to_tokens = collections.OrderedDict([
|
||||
(ids, tok) for tok, ids in self.vocab.items()
|
||||
])
|
||||
self.do_basic_tokenize = do_basic_tokenize
|
||||
if do_basic_tokenize:
|
||||
self.basic_tokenizer = BasicTokenizer(
|
||||
do_lower_case=do_lower_case, never_split=never_split)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
|
||||
self.max_len = max_len if max_len is not None else int(1e12)
|
||||
|
||||
def tokenize(self, text):
|
||||
if self.do_basic_tokenize:
|
||||
split_tokens = []
|
||||
for token in self.basic_tokenizer.tokenize(text):
|
||||
for sub_token in self.wordpiece_tokenizer.tokenize(token):
|
||||
split_tokens.append(sub_token)
|
||||
else:
|
||||
split_tokens = self.wordpiece_tokenizer.tokenize(text)
|
||||
return split_tokens
|
||||
|
||||
def convert_tokens_to_ids(self, tokens):
|
||||
"""Converts a sequence of tokens into ids using the vocab."""
|
||||
ids = []
|
||||
for token in tokens:
|
||||
ids.append(self.vocab[token])
|
||||
if len(ids) > self.max_len:
|
||||
logger.warning(
|
||||
'Token indices sequence length is longer than the specified maximum '
|
||||
' sequence length for this BERT model ({} > {}). Running this'
|
||||
' sequence through BERT will result in indexing errors'.format(
|
||||
len(ids), self.max_len))
|
||||
return ids
|
||||
|
||||
def convert_ids_to_tokens(self, ids):
|
||||
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
|
||||
tokens = []
|
||||
for i in ids:
|
||||
tokens.append(self.ids_to_tokens[i])
|
||||
return tokens
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls,
|
||||
pretrained_model_name_or_path,
|
||||
cache_dir=None,
|
||||
*inputs,
|
||||
**kwargs):
|
||||
"""
|
||||
Instantiate a PreTrainedBertModel from a pre-trained model file.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
"""
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
|
||||
pretrained_model_name_or_path]
|
||||
else:
|
||||
vocab_file = pretrained_model_name_or_path
|
||||
if os.path.isdir(vocab_file):
|
||||
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find any file "
|
||||
'associated to this path or url.'.format(
|
||||
pretrained_model_name_or_path,
|
||||
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
|
||||
vocab_file))
|
||||
return None
|
||||
if resolved_vocab_file == vocab_file:
|
||||
logger.info('loading vocabulary file {}'.format(vocab_file))
|
||||
else:
|
||||
logger.info('loading vocabulary file {} from cache at {}'.format(
|
||||
vocab_file, resolved_vocab_file))
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
|
||||
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
|
||||
# than the number of positional embeddings
|
||||
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
|
||||
pretrained_model_name_or_path]
|
||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
||||
# Instantiate tokenizer.
|
||||
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
|
||||
class BasicTokenizer(object):
|
||||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
||||
|
||||
def __init__(self,
|
||||
do_lower_case=True,
|
||||
never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
|
||||
"""Constructs a BasicTokenizer.
|
||||
|
||||
Args:
|
||||
do_lower_case: Whether to lower case the input.
|
||||
"""
|
||||
self.do_lower_case = do_lower_case
|
||||
self.never_split = never_split
|
||||
|
||||
def tokenize(self, text):
|
||||
"""Tokenizes a piece of text."""
|
||||
text = self._clean_text(text)
|
||||
# This was added on November 1st, 2018 for the multilingual and Chinese
|
||||
# models. This is also applied to the English models now, but it doesn't
|
||||
# matter since the English models were not trained on any Chinese data
|
||||
# and generally don't have any Chinese data in them (there are Chinese
|
||||
# characters in the vocabulary because Wikipedia does have some Chinese
|
||||
# words in the English Wikipedia.).
|
||||
text = self._tokenize_chinese_chars(text)
|
||||
orig_tokens = whitespace_tokenize(text)
|
||||
split_tokens = []
|
||||
for token in orig_tokens:
|
||||
if self.do_lower_case and token not in self.never_split:
|
||||
token = token.lower()
|
||||
token = self._run_strip_accents(token)
|
||||
split_tokens.extend(self._run_split_on_punc(token))
|
||||
|
||||
output_tokens = whitespace_tokenize(' '.join(split_tokens))
|
||||
return output_tokens
|
||||
|
||||
def _run_strip_accents(self, text):
|
||||
"""Strips accents from a piece of text."""
|
||||
text = unicodedata.normalize('NFD', text)
|
||||
output = []
|
||||
for char in text:
|
||||
cat = unicodedata.category(char)
|
||||
if cat == 'Mn':
|
||||
continue
|
||||
output.append(char)
|
||||
return ''.join(output)
|
||||
|
||||
def _run_split_on_punc(self, text):
|
||||
"""Splits punctuation on a piece of text."""
|
||||
if text in self.never_split:
|
||||
return [text]
|
||||
chars = list(text)
|
||||
i = 0
|
||||
start_new_word = True
|
||||
output = []
|
||||
while i < len(chars):
|
||||
char = chars[i]
|
||||
if _is_punctuation(char):
|
||||
output.append([char])
|
||||
start_new_word = True
|
||||
else:
|
||||
if start_new_word:
|
||||
output.append([])
|
||||
start_new_word = False
|
||||
output[-1].append(char)
|
||||
i += 1
|
||||
|
||||
return [''.join(x) for x in output]
|
||||
|
||||
def _tokenize_chinese_chars(self, text):
|
||||
"""Adds whitespace around any CJK character."""
|
||||
output = []
|
||||
for char in text:
|
||||
cp = ord(char)
|
||||
if self._is_chinese_char(cp):
|
||||
output.append(' ')
|
||||
output.append(char)
|
||||
output.append(' ')
|
||||
else:
|
||||
output.append(char)
|
||||
return ''.join(output)
|
||||
|
||||
def _is_chinese_char(self, cp):
|
||||
"""Checks whether CP is the codepoint of a CJK character."""
|
||||
# This defines a "chinese character" as anything in the CJK Unicode block:
|
||||
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
|
||||
#
|
||||
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
|
||||
# despite its name. The modern Korean Hangul alphabet is a different block,
|
||||
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
|
||||
# space-separated words, so they are not treated specially and handled
|
||||
# like all of the other languages.
|
||||
if ((cp >= 0x4E00 and cp <= 0x9FFF) or # noqa
|
||||
(cp >= 0x3400 and cp <= 0x4DBF) or # noqa
|
||||
(cp >= 0x20000 and cp <= 0x2A6DF) or # noqa
|
||||
(cp >= 0x2A700 and cp <= 0x2B73F) or # noqa
|
||||
(cp >= 0x2B740 and cp <= 0x2B81F) or # noqa
|
||||
(cp >= 0x2B820 and cp <= 0x2CEAF) or # noqa
|
||||
(cp >= 0xF900 and cp <= 0xFAFF) or # noqa
|
||||
(cp >= 0x2F800 and cp <= 0x2FA1F)): # noqa
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _clean_text(self, text):
|
||||
"""Performs invalid character removal and whitespace cleanup on text."""
|
||||
output = []
|
||||
for char in text:
|
||||
cp = ord(char)
|
||||
if cp == 0 or cp == 0xfffd or _is_control(char):
|
||||
continue
|
||||
if _is_whitespace(char):
|
||||
output.append(' ')
|
||||
else:
|
||||
output.append(char)
|
||||
return ''.join(output)
|
||||
|
||||
|
||||
class WordpieceTokenizer(object):
|
||||
"""Runs WordPiece tokenization."""
|
||||
|
||||
def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100):
|
||||
self.vocab = vocab
|
||||
self.unk_token = unk_token
|
||||
self.max_input_chars_per_word = max_input_chars_per_word
|
||||
|
||||
def tokenize(self, text):
|
||||
"""Tokenizes a piece of text into its word pieces.
|
||||
|
||||
This uses a greedy longest-match-first algorithm to perform tokenization
|
||||
using the given vocabulary.
|
||||
|
||||
For example:
|
||||
input = "unaffable"
|
||||
output = ["un", "##aff", "##able"]
|
||||
|
||||
Args:
|
||||
text: A single token or whitespace separated tokens. This should have
|
||||
already been passed through `BasicTokenizer`.
|
||||
|
||||
Returns:
|
||||
A list of wordpiece tokens.
|
||||
"""
|
||||
|
||||
output_tokens = []
|
||||
for token in whitespace_tokenize(text):
|
||||
chars = list(token)
|
||||
if len(chars) > self.max_input_chars_per_word:
|
||||
output_tokens.append(self.unk_token)
|
||||
continue
|
||||
|
||||
is_bad = False
|
||||
start = 0
|
||||
sub_tokens = []
|
||||
while start < len(chars):
|
||||
end = len(chars)
|
||||
cur_substr = None
|
||||
while start < end:
|
||||
substr = ''.join(chars[start:end])
|
||||
if start > 0:
|
||||
substr = '##' + substr
|
||||
if substr in self.vocab:
|
||||
cur_substr = substr
|
||||
break
|
||||
end -= 1
|
||||
if cur_substr is None:
|
||||
is_bad = True
|
||||
break
|
||||
sub_tokens.append(cur_substr)
|
||||
start = end
|
||||
|
||||
if is_bad:
|
||||
output_tokens.append(self.unk_token)
|
||||
else:
|
||||
output_tokens.extend(sub_tokens)
|
||||
return output_tokens
|
||||
|
||||
|
||||
def _is_whitespace(char):
|
||||
"""Checks whether `chars` is a whitespace character."""
|
||||
# \t, \n, and \r are technically control characters but we treat them
|
||||
# as whitespace since they are generally considered as such.
|
||||
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
|
||||
return True
|
||||
cat = unicodedata.category(char)
|
||||
if cat == 'Zs':
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _is_control(char):
|
||||
"""Checks whether `chars` is a control character."""
|
||||
# These are technically control characters but we count them as whitespace
|
||||
# characters.
|
||||
if char == '\t' or char == '\n' or char == '\r':
|
||||
return False
|
||||
cat = unicodedata.category(char)
|
||||
if cat.startswith('C'):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _is_punctuation(char):
|
||||
"""Checks whether `chars` is a punctuation character."""
|
||||
cp = ord(char)
|
||||
# We treat all non-letter/number ASCII as punctuation.
|
||||
# Characters such as "^", "$", and "`" are not in the Unicode
|
||||
# Punctuation class but we treat them as punctuation anyways, for
|
||||
# consistency.
|
||||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
|
||||
or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
|
||||
return True
|
||||
cat = unicodedata.category(char)
|
||||
if cat.startswith('P'):
|
||||
return True
|
||||
return False
|
||||
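# A usage sketch for the WordPiece pipeline above, assuming a local 'vocab.txt'
# in the one-token-per-line format read by load_vocab() and a vocabulary that
# covers the pieces shown in the comment.
def _example_wordpiece_round_trip(vocab_file):
    tokenizer = BertTokenizer(vocab_file, do_lower_case=True)
    tokens = tokenizer.tokenize('unaffable prose')  # e.g. ['un', '##aff', '##able', 'prose']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    back = tokenizer.convert_ids_to_tokens(ids)
    return tokens, ids, back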
20
modelscope/models/nlp/mglm/fp16/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .fp16 import * # noqa
|
||||
from .fp16util import (BN_convert_float, FP16Model, clip_grad_norm,
|
||||
convert_module, convert_network,
|
||||
master_params_to_model_params,
|
||||
model_grads_to_master_grads, network_to_half,
|
||||
prep_param_lists, to_python_float, tofp16)
|
||||
from .loss_scaler import * # noqa
|
||||
660
modelscope/models/nlp/mglm/fp16/fp16.py
Executable file
@@ -0,0 +1,660 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Stable version of apex FP16 Optimizer"""
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
||||
from torch.autograd import Variable
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from .fp16util import (clip_grad_norm, master_params_to_model_params,
|
||||
model_grads_to_master_grads)
|
||||
from .loss_scaler import DynamicLossScaler, LossScaler
|
||||
|
||||
FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
|
||||
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
|
||||
|
||||
|
||||
def conversion_helper(val, conversion):
|
||||
"""Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure."""
|
||||
if not isinstance(val, (tuple, list)):
|
||||
return conversion(val)
|
||||
rtn = [conversion_helper(v, conversion) for v in val]
|
||||
if isinstance(val, tuple):
|
||||
rtn = tuple(rtn)
|
||||
return rtn
|
||||
|
||||
|
||||
def fp32_to_fp16(val):
|
||||
"""Convert fp32 `val` to fp16"""
|
||||
|
||||
def half_conversion(val):
|
||||
val_typecheck = val
|
||||
if isinstance(val_typecheck, (Parameter, Variable)):
|
||||
val_typecheck = val.data
|
||||
if isinstance(val_typecheck, FLOAT_TYPES):
|
||||
val = val.half()
|
||||
return val
|
||||
|
||||
return conversion_helper(val, half_conversion)
|
||||
|
||||
|
||||
def fp16_to_fp32(val):
|
||||
"""Convert fp16 `val` to fp32"""
|
||||
|
||||
def float_conversion(val):
|
||||
val_typecheck = val
|
||||
if isinstance(val_typecheck, (Parameter, Variable)):
|
||||
val_typecheck = val.data
|
||||
if isinstance(val_typecheck, HALF_TYPES):
|
||||
val = val.float()
|
||||
return val
|
||||
|
||||
return conversion_helper(val, float_conversion)
|
||||
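# A small illustration of how conversion_helper and the fp32_to_fp16 /
# fp16_to_fp32 wrappers recurse through nested tuples and lists: only tensor
# leaves are cast, non-tensor entries pass through unchanged. This is the same
# mechanism FP16_Module below applies to its forward inputs and outputs.
def _example_precision_round_trip():
    inputs = (torch.randn(2, 3), [torch.randn(4)], 'labels stay untouched')
    half_inputs = fp32_to_fp16(inputs)        # float tensors cast to half, nesting kept
    float_inputs = fp16_to_fp32(half_inputs)  # half tensors cast back to float
    return half_inputs, float_inputs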
|
||||
|
||||
class FP16_Module(nn.Module):
|
||||
|
||||
def __init__(self, module):
|
||||
super(FP16_Module, self).__init__()
|
||||
self.add_module('module', module.half())
|
||||
|
||||
def forward(self, *inputs, **kwargs):
|
||||
return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs))
|
||||
|
||||
def named_parameters(self, prefix: str = '', recurse: bool = True):
|
||||
return self.module.named_parameters(prefix=prefix, recurse=recurse)
|
||||
|
||||
def state_dict(self, destination=None, prefix='', keep_vars=False):
|
||||
return self.module.state_dict(destination, prefix, keep_vars)
|
||||
|
||||
def load_state_dict(self, state_dict, strict=True):
|
||||
return self.module.load_state_dict(state_dict, strict=strict)
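# A hedged sketch (illustrative only, not from the original file): FP16_Module
# keeps the wrapped module's weights in half precision while callers pass and
# receive fp32 tensors.
def _example_fp16_module():
    import torch
    model = FP16_Module(torch.nn.Linear(8, 4).cuda())
    out = model(torch.randn(2, 8).cuda())  # cast to half inside, back to float outside
    return out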
|
||||
|
||||
|
||||
# TODO: Update overflow check + downscale to use Carl's fused kernel.
|
||||
class FP16_Optimizer(object):
|
||||
"""
|
||||
:class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer,
|
||||
and manage static or dynamic loss scaling and master weights in a manner transparent to the user.
|
||||
For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance,
|
||||
and changing the call to ``backward``.
|
||||
|
||||
Example::
|
||||
|
||||
model = torch.nn.Linear(D_in, D_out).cuda().half()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
|
||||
# Name the FP16_Optimizer instance to replace the existing optimizer
|
||||
# (recommended but not required):
|
||||
optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
|
||||
...
|
||||
# loss.backward() becomes:
|
||||
optimizer.backward(loss)
|
||||
...
|
||||
|
||||
Example with dynamic loss scaling::
|
||||
|
||||
...
|
||||
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||
# optional arg to control dynamic loss scaling behavior
|
||||
# dynamic_loss_args={'scale_window' : 500})
|
||||
# Usually, dynamic_loss_args is not necessary.
|
||||
|
||||
Args:
|
||||
init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`.
|
||||
static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.
|
||||
dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option.
|
||||
dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used.
|
||||
verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling.
|
||||
|
||||
``init_optimizer`` is expected to have been constructed in the ordinary way.
|
||||
It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be
|
||||
named to replace ``init_optimizer``, for two reasons:
|
||||
First, it means that references to the same name
|
||||
later in the file will not have to change.
|
||||
Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to
|
||||
modify ``init_optimizer``. If you do choose a unique name for the new
|
||||
:class:`FP16_Optimizer` instance, you should only work with this new instance,
|
||||
because the preexisting optimizer might no longer behave as expected.
|
||||
|
||||
``init_optimizer`` may be any Pytorch optimizer.
|
||||
It may contain a mixture of fp16 and fp32 parameters organized into any number of
|
||||
``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will
|
||||
ingest these ``param_groups`` and remember them.
|
||||
|
||||
Calls to ::
|
||||
|
||||
loss.backward()
|
||||
|
||||
must be replaced with ::
|
||||
|
||||
optimizer.backward(loss)
|
||||
|
||||
because :class:`FP16_Optimizer` requires ownership of the backward pass to implement
|
||||
loss scaling and copies to master gradients.
|
||||
|
||||
.. note::
|
||||
Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients
|
||||
are downscaled before being applied. This means that adjusting the loss scale, or using
|
||||
dynamic loss scaling, should not require retuning the learning rate or any other
|
||||
hyperparameters.
|
||||
|
||||
|
||||
**Advanced options**
|
||||
|
||||
**Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure.
|
||||
See docstring for :attr:`step`.
|
||||
|
||||
**Gradient clipping**: Use :attr:`clip_master_grads`.
|
||||
|
||||
**Multiple losses**: If your model accumulates gradients from multiple losses,
|
||||
this can be made more efficient by supplying ``update_master_grads=False``
|
||||
to :attr:`backward`. See docstring for :attr:`backward`.
|
||||
|
||||
**Manually adjusting loss scale**: The current loss scale can be retrieved or set via ::
|
||||
|
||||
print(optimizer.loss_scale)
|
||||
optimizer.loss_scale = new_loss_scale
|
||||
|
||||
For static loss scaling, manually adjusting the loss scale over time is a reasonable
|
||||
thing to do. During later epochs, gradients may become smaller, and a
|
||||
higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss
|
||||
scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting
|
||||
the loss scale is not recommended.
|
||||
|
||||
**Multi-GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
|
||||
Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
|
||||
should still work as intended.
|
||||
""" # noqa
|
||||
|
||||
def __init__(self,
|
||||
init_optimizer,
|
||||
static_loss_scale=1.0,
|
||||
dynamic_loss_scale=False,
|
||||
dynamic_loss_args=None,
|
||||
verbose=False):
|
||||
if not torch.cuda.is_available():
|
||||
raise SystemError('Cannot use fp16 without CUDA.')
|
||||
|
||||
self.verbose = verbose
|
||||
|
||||
self.optimizer = init_optimizer
|
||||
# init_state_dict sets up an alternative way to cast per-param state tensors.
|
||||
# Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
|
||||
# init_state_dict = init_optimizer.state_dict()
|
||||
|
||||
self.fp16_groups = []
|
||||
self.fp32_from_fp16_groups = []
|
||||
self.fp32_from_fp32_groups = []
|
||||
for i, param_group in enumerate(self.optimizer.param_groups):
|
||||
self.maybe_print(
|
||||
'FP16_Optimizer processing param group {}:'.format(i))
|
||||
fp16_params_this_group = []
|
||||
fp32_params_this_group = []
|
||||
fp32_from_fp16_params_this_group = []
|
||||
for i, param in enumerate(param_group['params']):
|
||||
if param.requires_grad:
|
||||
if param.type() == 'torch.cuda.HalfTensor':
|
||||
self.maybe_print(
|
||||
'FP16_Optimizer received torch.cuda.HalfTensor with {}'
|
||||
.format(param.size()))
|
||||
fp16_params_this_group.append(param)
|
||||
master_param = param.detach().clone().float()
|
||||
master_param.requires_grad = True
|
||||
# Copy the model parallel flag.
|
||||
master_param.model_parallel = param.model_parallel
|
||||
param_group['params'][i] = master_param
|
||||
fp32_from_fp16_params_this_group.append(master_param)
|
||||
# Reset existing state dict key to the new master param.
|
||||
# We still need to recast per-param state tensors, if any, to FP32.
|
||||
if param in self.optimizer.state:
|
||||
self.optimizer.state[
|
||||
master_param] = self.optimizer.state.pop(param)
|
||||
elif param.type() == 'torch.cuda.FloatTensor':
|
||||
self.maybe_print(
|
||||
'FP16_Optimizer received torch.cuda.FloatTensor with {}'
|
||||
.format(param.size()))
|
||||
fp32_params_this_group.append(param)
|
||||
param_group['params'][i] = param
|
||||
else:
|
||||
raise TypeError(
|
||||
'Wrapped parameters must be either '
|
||||
'torch.cuda.FloatTensor or torch.cuda.HalfTensor. '
|
||||
'Received {}'.format(param.type()))
|
||||
|
||||
self.fp16_groups.append(fp16_params_this_group)
|
||||
self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
|
||||
self.fp32_from_fp32_groups.append(fp32_params_this_group)
|
||||
|
||||
# Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
|
||||
self.optimizer.load_state_dict(self.optimizer.state_dict())
|
||||
# alternative way to cast per-param state tensors:
|
||||
# self.optimizer.load_state_dict(init_state_dict)
|
||||
|
||||
if dynamic_loss_scale:
|
||||
self.dynamic_loss_scale = True
|
||||
if dynamic_loss_args is not None:
|
||||
self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
|
||||
else:
|
||||
self.loss_scaler = DynamicLossScaler()
|
||||
else:
|
||||
self.dynamic_loss_scale = False
|
||||
self.loss_scaler = LossScaler(static_loss_scale)
|
||||
|
||||
self.overflow = False
|
||||
self.first_closure_call_this_step = True
|
||||
|
||||
self.clip_grad_norm = clip_grad_norm
|
||||
|
||||
def maybe_print(self, msg):
|
||||
if self.verbose:
|
||||
print(msg)
|
||||
|
||||
def __getstate__(self):
|
||||
raise RuntimeError(
|
||||
'FP16_Optimizer should be serialized using state_dict().')
|
||||
|
||||
def __setstate__(self, state):
|
||||
raise RuntimeError(
|
||||
'FP16_Optimizer should be deserialized using load_state_dict().')
|
||||
|
||||
def zero_grad(self, set_grads_to_None=False):
|
||||
"""
|
||||
Zero fp32 and fp16 parameter grads.
|
||||
"""
|
||||
# In principle, only the .grad attributes of the model params need to be zeroed,
|
||||
# because gradients are copied into the FP32 master params. However, we zero
|
||||
# all gradients owned by the optimizer, just to be safe:
|
||||
for group in self.optimizer.param_groups:
|
||||
for p in group['params']:
|
||||
if set_grads_to_None:
|
||||
p.grad = None
|
||||
else:
|
||||
if p.grad is not None:
|
||||
p.grad.detach_()
|
||||
p.grad.zero_()
|
||||
|
||||
# Zero fp16 gradients owned by the model:
|
||||
for fp16_group in self.fp16_groups:
|
||||
for param in fp16_group:
|
||||
if set_grads_to_None:
|
||||
param.grad = None
|
||||
else:
|
||||
if param.grad is not None:
|
||||
param.grad.detach_(
|
||||
) # as in torch.optim.optimizer.zero_grad()
|
||||
param.grad.zero_()
|
||||
|
||||
def _check_overflow(self):
|
||||
params = []
|
||||
for group in self.fp16_groups:
|
||||
for param in group:
|
||||
params.append(param)
|
||||
for group in self.fp32_from_fp32_groups:
|
||||
for param in group:
|
||||
params.append(param)
|
||||
self.overflow = self.loss_scaler.has_overflow(params)
|
||||
|
||||
def _update_scale(self, has_overflow=False):
|
||||
self.loss_scaler.update_scale(has_overflow)
|
||||
|
||||
def _master_params_to_model_params(self):
|
||||
for fp16_group, fp32_from_fp16_group in zip(
|
||||
self.fp16_groups, self.fp32_from_fp16_groups):
|
||||
master_params_to_model_params(fp16_group, fp32_from_fp16_group)
|
||||
|
||||
def _model_params_to_master_params(self):
|
||||
for fp16_group, fp32_from_fp16_group in zip(
|
||||
self.fp16_groups, self.fp32_from_fp16_groups):
|
||||
master_params_to_model_params(fp32_from_fp16_group, fp16_group)
|
||||
|
||||
# To consider: Integrate distributed with this wrapper by registering a hook on each variable
|
||||
# that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
|
||||
def _model_grads_to_master_grads(self):
|
||||
for fp16_group, fp32_from_fp16_group in zip(
|
||||
self.fp16_groups, self.fp32_from_fp16_groups):
|
||||
model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)
|
||||
|
||||
def _downscale_master(self):
|
||||
if self.loss_scale != 1.0:
|
||||
for group in self.optimizer.param_groups:
|
||||
for param in group['params']:
|
||||
if param.grad is not None:
|
||||
param.grad.data.mul_(1. / self.loss_scale)
|
||||
|
||||
def clip_master_grads(self, max_norm, norm_type=2):
|
||||
"""
|
||||
Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``.
|
||||
|
||||
Args:
|
||||
max_norm (float or int): max norm of the gradients
|
||||
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
|
||||
infinity norm.
|
||||
|
||||
Returns:
|
||||
Total norm of the current fp32 gradients (viewed as a single vector).
|
||||
|
||||
.. warning::
|
||||
Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``).
|
||||
""" # noqa
|
||||
if not self.overflow:
|
||||
fp32_params = []
|
||||
for param_group in self.optimizer.param_groups:
|
||||
for param in param_group['params']:
|
||||
fp32_params.append(param)
|
||||
return self.clip_grad_norm(fp32_params, max_norm, norm_type)
|
||||
else:
|
||||
return -1
|
||||
|
||||
def state_dict(self):
|
||||
"""
|
||||
Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
|
||||
This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
|
||||
of the contained Pytorch optimizer.
|
||||
Example::
|
||||
|
||||
checkpoint = {}
|
||||
checkpoint['model'] = model.state_dict()
|
||||
checkpoint['optimizer'] = optimizer.state_dict()
|
||||
torch.save(checkpoint, "saved.pth")
|
||||
"""
|
||||
state_dict = {}
|
||||
state_dict['loss_scaler'] = self.loss_scaler
|
||||
state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
|
||||
state_dict['overflow'] = self.overflow
|
||||
state_dict[
|
||||
'first_closure_call_this_step'] = self.first_closure_call_this_step
|
||||
state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
|
||||
state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups
|
||||
return state_dict
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
"""
|
||||
Loads a state_dict created by an earlier call to state_dict().
|
||||
If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
|
||||
whose parameters in turn came from ``model``, it is expected that the user
|
||||
will call ``model.load_state_dict()`` before
|
||||
``fp16_optimizer_instance.load_state_dict()`` is called.
|
||||
|
||||
Example::
|
||||
|
||||
model = torch.nn.Linear(D_in, D_out).cuda().half()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
|
||||
optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
|
||||
...
|
||||
checkpoint = torch.load("saved.pth")
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
optimizer.load_state_dict(checkpoint['optimizer'])
|
||||
"""
|
||||
# I think it should actually be ok to reload the optimizer before the model.
|
||||
self.loss_scaler = state_dict['loss_scaler']
|
||||
self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
|
||||
self.overflow = state_dict['overflow']
|
||||
self.first_closure_call_this_step = state_dict[
|
||||
'first_closure_call_this_step']
|
||||
self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
|
||||
# At this point, the optimizer's references to the model's fp32 parameters are up to date.
|
||||
# The optimizer's hyperparameters and internal buffers are also up to date.
|
||||
# However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
|
||||
# out of date. There are two options.
|
||||
# 1: Refresh the master params from the model's fp16 params.
|
||||
# This requires less storage but incurs precision loss.
|
||||
# 2: Save and restore the fp32 master copies separately.
|
||||
# We choose option 2.
|
||||
#
|
||||
# Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
|
||||
# of their associated parameters, because it's possible those buffers might not exist yet in
|
||||
# the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
|
||||
# constructed in the same way as the one whose state_dict we are loading, the same master params
|
||||
# are guaranteed to exist, so we can just copy_() from the saved master params.
|
||||
for current_group, saved_group in zip(self.fp32_from_fp16_groups,
|
||||
state_dict['fp32_from_fp16']):
|
||||
for current, saved in zip(current_group, saved_group):
|
||||
current.data.copy_(saved.data)
|
||||
|
||||
def step(self, closure=None): # could add clip option.
|
||||
"""
|
||||
If no closure is supplied, :attr:`step` should be called after
|
||||
``fp16_optimizer_obj.backward(loss)``.
|
||||
:attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to
|
||||
:class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params
|
||||
originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run
|
||||
another forward pass using their model.
|
||||
|
||||
If a closure is supplied, :attr:`step` may be called without a prior call to
|
||||
:attr:`backward(loss)`.
|
||||
This control flow is identical to `ordinary Pytorch optimizer use`_ with closures.
|
||||
However, the user should take care that any ``loss.backward()`` call within the closure
|
||||
has been replaced by ``fp16_optimizer_obj.backward(loss)``.
|
||||
|
||||
Args:
|
||||
closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss.
|
||||
|
||||
Example with closure::
|
||||
|
||||
# optimizer is assumed to be an FP16_Optimizer object, previously constructed from an
|
||||
# existing pytorch optimizer.
|
||||
for input, target in dataset:
|
||||
def closure():
|
||||
optimizer.zero_grad()
|
||||
output = model(input)
|
||||
loss = loss_fn(output, target)
|
||||
# loss.backward() becomes:
|
||||
optimizer.backward(loss)
|
||||
return loss
|
||||
optimizer.step(closure)
|
||||
|
||||
.. warning::
|
||||
Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling.
|
||||
|
||||
.. _`ordinary Pytorch optimizer use`:
|
||||
http://pytorch.org/docs/master/optim.html#optimizer-step-closure
|
||||
""" # noqa
|
||||
|
||||
scale = self.loss_scaler.loss_scale
|
||||
self._update_scale(self.overflow)
|
||||
|
||||
if self.overflow:
|
||||
self.maybe_print(
|
||||
'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}'
|
||||
.format(scale, self.loss_scale))
|
||||
return
|
||||
|
||||
if closure is not None:
|
||||
retval = self._step_with_closure(closure)
|
||||
else:
|
||||
retval = self.optimizer.step()
|
||||
|
||||
self._master_params_to_model_params()
|
||||
|
||||
return retval
|
||||
|
||||
def _step_with_closure(self, closure):
|
||||
|
||||
def wrapped_closure():
|
||||
# helpful for debugging
|
||||
# print("Calling wrapped_closure, first_closure_call_this_step = {}"
|
||||
# .format(self.first_closure_call_this_step))
|
||||
if self.first_closure_call_this_step:
|
||||
# We expect that the fp16 params are initially fresh on entering self.step(),
|
||||
# so _master_params_to_model_params() is unnecessary the first time wrapped_closure()
|
||||
# is called within self.optimizer.step().
|
||||
self.first_closure_call_this_step = False
|
||||
else:
|
||||
# If self.optimizer.step() internally calls wrapped_closure more than once,
|
||||
# it may update the fp32 params after each call. However, self.optimizer
|
||||
# doesn't know about the fp16 params at all. If the fp32 params get updated,
|
||||
# we can't rely on self.optimizer to refresh the fp16 params. We need
|
||||
# to handle that manually:
|
||||
self._master_params_to_model_params()
|
||||
# Our API expects the user to give us ownership of the backward() call by
|
||||
# replacing all calls to loss.backward() with optimizer.backward(loss).
|
||||
# This requirement holds whether or not the call to backward() is made within a closure.
|
||||
# If the user is properly calling optimizer.backward(loss) within "closure,"
|
||||
# calling closure() here will give the fp32 master params fresh gradients
|
||||
# for the optimizer to play with, so all wrapped_closure needs to do is call
|
||||
# closure() and return the loss.
|
||||
temp_loss = closure()
|
||||
while (self.overflow):
|
||||
scale = self.loss_scaler.loss_scale
|
||||
self._update_scale(self.overflow)
|
||||
self.maybe_print(
|
||||
'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, '
|
||||
'reducing to {}'.format(scale, self.loss_scale))
|
||||
temp_loss = closure()
|
||||
return temp_loss
|
||||
|
||||
retval = self.optimizer.step(wrapped_closure)
|
||||
|
||||
self.first_closure_call_this_step = True
|
||||
|
||||
return retval
|
||||
|
||||
def backward(self, loss, update_master_grads=True, retain_graph=False):
|
||||
"""
|
||||
:attr:`backward` performs the following conceptual steps:
|
||||
|
||||
1. fp32_loss = loss.float() (see first Note below)
|
||||
2. scaled_loss = fp32_loss*loss_scale
|
||||
3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined).
|
||||
4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32.
|
||||
5. Finally, master grads are divided by loss_scale.
|
||||
|
||||
In this way, after :attr:`backward`, the master params have fresh gradients,
|
||||
and :attr:`step` may be called.
|
||||
|
||||
.. note::
|
||||
:attr:`backward` internally converts the loss to fp32 before applying the loss scale.
|
||||
This provides some additional safety against overflow if the user has supplied an
|
||||
fp16 loss value.
|
||||
However, for maximum overflow safety, the user should
|
||||
compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to
|
||||
:attr:`backward`.
|
||||
|
||||
.. warning::
|
||||
The gradients found in a model's leaves after the call to
|
||||
:attr:`backward` should not be regarded as valid in general,
|
||||
because it's possible
|
||||
they have been scaled (and in the case of dynamic loss scaling,
|
||||
the scale factor may change over time).
|
||||
If the user wants to inspect gradients after a call to :attr:`backward`,
|
||||
only the master gradients should be regarded as valid. These can be retrieved via
|
||||
:attr:`inspect_master_grad_data()`.
|
||||
|
||||
Args:
|
||||
loss: The loss output by the user's model. loss may be either float or half (but see first Note above).
|
||||
update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`.
|
||||
retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below).
|
||||
|
||||
Example::
|
||||
|
||||
# Ordinary operation:
|
||||
optimizer.backward(loss)
|
||||
|
||||
# Naive operation with multiple losses (technically valid, but less efficient):
|
||||
# fp32 grads will be correct after the second call, but
|
||||
# the first call incurs an unnecessary fp16->fp32 grad copy.
|
||||
optimizer.backward(loss1)
|
||||
optimizer.backward(loss2)
|
||||
|
||||
# More efficient way to handle multiple losses:
|
||||
# The fp16->fp32 grad copy is delayed until fp16 grads from all
|
||||
# losses have been accumulated.
|
||||
optimizer.backward(loss1, update_master_grads=False)
|
||||
optimizer.backward(loss2, update_master_grads=False)
|
||||
optimizer.update_master_grads()
|
||||
""" # noqa
|
||||
# To consider: try multiple backward passes using retain_graph=True to find
|
||||
# a loss scale that works. After you find a loss scale that works, do a final dummy
|
||||
# backward pass with retain_graph=False to tear down the graph. Doing this would avoid
|
||||
# discarding the iteration, but probably wouldn't improve overall efficiency.
|
||||
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
|
||||
if update_master_grads:
|
||||
self.update_master_grads()
|
||||
|
||||
def update_master_grads(self):
|
||||
"""
|
||||
Copy the ``.grad`` attribute from stored references to fp16 parameters to
|
||||
the ``.grad`` attribute of the fp32 master parameters that are directly
|
||||
updated by the optimizer. :attr:`update_master_grads` only needs to be called if
|
||||
``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
|
||||
""" # noqa
|
||||
if self.dynamic_loss_scale:
|
||||
self._check_overflow()
|
||||
if self.overflow: return # noqa
|
||||
self._model_grads_to_master_grads()
|
||||
self._downscale_master()
|
||||
|
||||
def inspect_master_grad_data(self):
|
||||
"""
|
||||
When running with :class:`FP16_Optimizer`,
|
||||
``.grad`` attributes of a model's fp16 leaves should not be
|
||||
regarded as truthful, because they might be scaled.
|
||||
After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
|
||||
the fp32 master params' ``.grad``
|
||||
attributes will contain valid gradients properly divided by the loss scale. However,
|
||||
because :class:`FP16_Optimizer` flattens some parameters, accessing them may be
|
||||
nonintuitive. :attr:`inspect_master_grad_data`
|
||||
allows those gradients to be viewed with shapes corresponding to their associated model leaves.
|
||||
|
||||
Returns:
|
||||
List of lists (one list for each parameter group). The list for each parameter group
|
||||
is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group.
|
||||
"""
|
||||
if self.overflow:
|
||||
print(
|
||||
'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. '
|
||||
'Gradients are currently invalid (may be inf, nan, or stale). Returning None.'
|
||||
)
|
||||
return None
|
||||
else:
|
||||
# The optimizer owns only references to master params.
|
||||
master_grads_data = []
|
||||
for param_group in self.optimizer.param_groups:
|
||||
master_grads_this_group = []
|
||||
for param in param_group['params']:
|
||||
if param.grad is not None:
|
||||
master_grads_this_group.append(param.grad.data)
|
||||
else:
|
||||
master_grads_this_group.append(None)
|
||||
master_grads_data.append(master_grads_this_group)
|
||||
return master_grads_data
|
||||
|
||||
# Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
|
||||
def _get_loss_scale(self):
|
||||
return self.loss_scaler.loss_scale
|
||||
|
||||
def _set_loss_scale(self, value):
|
||||
self.loss_scaler.cur_scale = value
|
||||
|
||||
loss_scale = property(_get_loss_scale, _set_loss_scale)
|
||||
|
||||
# Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
|
||||
def _get_state(self):
|
||||
return self.optimizer.state
|
||||
|
||||
def _set_state(self, value):
|
||||
self.optimizer.state = value
|
||||
|
||||
state = property(_get_state, _set_state)
|
||||
|
||||
# Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
|
||||
# (for example, to adjust the learning rate)
|
||||
def _get_param_groups(self):
|
||||
return self.optimizer.param_groups
|
||||
|
||||
def _set_param_groups(self, value):
|
||||
self.optimizer.param_groups = value
|
||||
|
||||
param_groups = property(_get_param_groups, _set_param_groups)
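# A hedged end-to-end sketch (not part of the original file) of a training
# step with this wrapper: backward() scales the loss and copies grads to the
# fp32 masters, clip_master_grads() clips them (returning -1 on overflow),
# and step() updates the masters and copies the result back into the fp16
# model params. The promoted properties behave like those of the wrapped optimizer.
def _example_training_step(optimizer, loss, max_norm=1.0):
    optimizer.zero_grad()
    optimizer.backward(loss)
    grad_norm = optimizer.clip_master_grads(max_norm)
    optimizer.step()                       # skipped internally if grads overflowed
    for group in optimizer.param_groups:   # forwarded to the wrapped optimizer
        group['lr'] *= 0.999
    return grad_norm, optimizer.loss_scale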
|
||||
220
modelscope/models/nlp/mglm/fp16/fp16util.py
Normal file
@@ -0,0 +1,220 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
||||
from torch.autograd import Variable
|
||||
|
||||
from modelscope.models.nlp.mglm import mpu
|
||||
|
||||
|
||||
class tofp16(nn.Module):
|
||||
"""
|
||||
Utility module that implements::
|
||||
|
||||
def forward(self, input):
|
||||
return input.half()
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super(tofp16, self).__init__()
|
||||
|
||||
def forward(self, input):
|
||||
return input.half()
|
||||
|
||||
|
||||
def BN_convert_float(module):
|
||||
"""
|
||||
Utility function for network_to_half().
|
||||
|
||||
Retained for legacy purposes.
|
||||
"""
|
||||
if isinstance(
|
||||
module,
|
||||
torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
|
||||
module.float()
|
||||
for child in module.children():
|
||||
BN_convert_float(child)
|
||||
return module
|
||||
|
||||
|
||||
def network_to_half(network):
|
||||
"""
|
||||
Convert model to half precision in a batchnorm-safe way.
|
||||
|
||||
Retained for legacy purposes. It is recommended to use FP16Model.
|
||||
"""
|
||||
return nn.Sequential(tofp16(), BN_convert_float(network.half()))
|
||||
|
||||
|
||||
def convert_module(module, dtype):
|
||||
"""
|
||||
Converts a module's immediate parameters and buffers to dtype.
|
||||
"""
|
||||
for param in module.parameters(recurse=False):
|
||||
if param is not None:
|
||||
if param.data.dtype.is_floating_point:
|
||||
param.data = param.data.to(dtype=dtype)
|
||||
if param._grad is not None and param._grad.data.dtype.is_floating_point:
|
||||
param._grad.data = param._grad.data.to(dtype=dtype)
|
||||
|
||||
for buf in module.buffers(recurse=False):
|
||||
if buf is not None and buf.data.dtype.is_floating_point:
|
||||
buf.data = buf.data.to(dtype=dtype)
|
||||
|
||||
|
||||
def convert_network(network, dtype):
|
||||
"""
|
||||
Converts a network's parameters and buffers to dtype.
|
||||
"""
|
||||
for module in network.modules():
|
||||
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm
|
||||
) and module.affine is True:
|
||||
continue
|
||||
convert_module(module, dtype)
|
||||
return network
|
||||
|
||||
|
||||
class FP16Model(nn.Module):
|
||||
"""
|
||||
Convert model to half precision in a batchnorm-safe way.
|
||||
"""
|
||||
|
||||
def __init__(self, network):
|
||||
super(FP16Model, self).__init__()
|
||||
self.network = convert_network(network, dtype=torch.half)
|
||||
|
||||
def forward(self, *inputs):
|
||||
inputs = tuple(t.half() for t in inputs)
|
||||
return self.network(*inputs)
|
||||
|
||||
|
||||
def backwards_debug_hook(grad):
|
||||
raise RuntimeError(
|
||||
'master_params received a gradient in the backward pass!')
|
||||
|
||||
|
||||
def prep_param_lists(model, flat_master=False):
|
||||
"""
|
||||
Creates a list of FP32 master parameters for a given model, as in
|
||||
`Training Neural Networks with Mixed Precision: Real Examples`_.
|
||||
|
||||
Args:
|
||||
model (torch.nn.Module): Existing Pytorch model
|
||||
flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization.
|
||||
Returns:
|
||||
A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element.
|
||||
|
||||
Example::
|
||||
|
||||
model_params, master_params = prep_param_lists(model)
|
||||
|
||||
.. warning::
|
||||
Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.
|
||||
|
||||
.. _`Training Neural Networks with Mixed Precision: Real Examples`:
|
||||
http://on-demand.gputechconf.com/gtc/2018/video/S81012/
|
||||
""" # noqa
|
||||
model_params = [
|
||||
param for param in model.parameters() if param.requires_grad
|
||||
]
|
||||
|
||||
if flat_master:
|
||||
# Give the user some more useful error messages
|
||||
try:
|
||||
# flatten_dense_tensors returns a contiguous flat array.
|
||||
# http://pytorch.org/docs/master/_modules/torch/_utils.html
|
||||
master_params = _flatten_dense_tensors(
|
||||
[param.data for param in model_params]).float()
|
||||
except: # noqa
|
||||
print(
|
||||
'Error in prep_param_lists: model may contain a mixture of parameters '
|
||||
'of different types. Use flat_master=False, or use FP16_Optimizer.'
|
||||
)
|
||||
raise
|
||||
master_params = torch.nn.Parameter(master_params)
|
||||
master_params.requires_grad = True
|
||||
# master_params.register_hook(backwards_debug_hook)
|
||||
if master_params.grad is None:
|
||||
master_params.grad = master_params.new(*master_params.size())
|
||||
return model_params, [master_params]
|
||||
else:
|
||||
master_params = [
|
||||
param.clone().float().detach() for param in model_params
|
||||
]
|
||||
for param in master_params:
|
||||
param.requires_grad = True
|
||||
return model_params, master_params
|
||||
|
||||
|
||||
def model_grads_to_master_grads(model_params,
|
||||
master_params,
|
||||
flat_master=False):
|
||||
"""
|
||||
Copy model gradients to master gradients.
|
||||
|
||||
Args:
|
||||
model_params: List of model parameters created by :func:`prep_param_lists`.
|
||||
master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
|
||||
""" # noqa
|
||||
if flat_master:
|
||||
# The flattening may incur one more deep copy than is necessary.
|
||||
master_params[0].grad.data.copy_(
|
||||
_flatten_dense_tensors([p.grad.data for p in model_params]))
|
||||
else:
|
||||
for model, master in zip(model_params, master_params):
|
||||
if model.grad is not None:
|
||||
if master.grad is None:
|
||||
master.grad = Variable(
|
||||
master.data.new(*master.data.size()))
|
||||
master.grad.data.copy_(model.grad.data)
|
||||
else:
|
||||
master.grad = None
|
||||
|
||||
|
||||
def master_params_to_model_params(model_params,
|
||||
master_params,
|
||||
flat_master=False):
|
||||
"""
|
||||
Copy master parameters to model parameters.
|
||||
|
||||
Args:
|
||||
model_params: List of model parameters created by :func:`prep_param_lists`.
|
||||
master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
|
||||
""" # noqa
|
||||
if flat_master:
|
||||
for model, master in zip(
|
||||
model_params,
|
||||
_unflatten_dense_tensors(master_params[0].data, model_params)):
|
||||
model.data.copy_(master)
|
||||
else:
|
||||
for model, master in zip(model_params, master_params):
|
||||
model.data.copy_(master.data)
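# A hedged sketch (not part of the original file) of the manual master-weights
# loop these helpers support; names below are illustrative. The optimizer is
# built over the fp32 masters returned by prep_param_lists (normally created
# once, outside the loop), grads are copied fp16 -> fp32 each step, and the
# updated fp32 weights are copied back into the fp16 model.
def _example_master_weight_training(model, loss_fn, data, lr=1e-3):
    import torch
    model_params, master_params = prep_param_lists(model)
    optimizer = torch.optim.SGD(master_params, lr=lr)
    for inputs, targets in data:
        loss = loss_fn(model(inputs), targets)
        model.zero_grad()
        loss.backward()
        model_grads_to_master_grads(model_params, master_params)    # fp16 -> fp32 grads
        optimizer.step()                                            # update fp32 masters
        master_params_to_model_params(model_params, master_params)  # copy back to fp16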
|
||||
|
||||
|
||||
# Backward compatibility fixes
|
||||
|
||||
|
||||
def to_python_float(t):
|
||||
if hasattr(t, 'item'):
|
||||
return t.item()
|
||||
else:
|
||||
return t[0]
|
||||
|
||||
|
||||
TORCH_MAJOR = int(torch.__version__.split('.')[0])
|
||||
TORCH_MINOR = int(torch.__version__.split('.')[1])
|
||||
|
||||
clip_grad_norm = mpu.clip_grad_norm
|
||||
245
modelscope/models/nlp/mglm/fp16/loss_scaler.py
Executable file
@@ -0,0 +1,245 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from modelscope.models.nlp.mglm import mpu
|
||||
|
||||
|
||||
# item() is a recent addition, so this helps with backward compatibility.
|
||||
def to_python_float(t):
|
||||
if hasattr(t, 'item'):
|
||||
return t.item()
|
||||
else:
|
||||
return t[0]
|
||||
|
||||
|
||||
class LossScaler:
|
||||
"""
|
||||
Class that manages a static loss scale. This class is intended to interact with
|
||||
:class:`FP16_Optimizer`, and should not be directly manipulated by the user.
|
||||
|
||||
Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
|
||||
:class:`FP16_Optimizer`'s constructor.
|
||||
|
||||
Args:
|
||||
scale (float, optional, default=1.0): The loss scale.
|
||||
"""
|
||||
|
||||
def __init__(self, scale=1):
|
||||
self.cur_scale = scale
|
||||
|
||||
# `params` is a list / generator of torch.Variable
|
||||
def has_overflow(self, params):
|
||||
return False
|
||||
|
||||
# `x` is a torch.Tensor
|
||||
def _has_inf_or_nan(x):
|
||||
return False
|
||||
|
||||
def update_scale(self, overflow):
|
||||
pass
|
||||
|
||||
@property
|
||||
def loss_scale(self):
|
||||
return self.cur_scale
|
||||
|
||||
def scale_gradient(self, module, grad_in, grad_out):
|
||||
return tuple(self.loss_scale * g for g in grad_in)
|
||||
|
||||
def backward(self, loss, retain_graph=False):
|
||||
scaled_loss = loss * self.loss_scale
|
||||
scaled_loss.backward(retain_graph=retain_graph)
|
||||
|
||||
|
||||
class DynamicLossScaler:
|
||||
"""
|
||||
Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
|
||||
indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
|
||||
:class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
|
||||
operates, because the default options can be changed using the
|
||||
``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
|
||||
|
||||
Loss scaling is designed to combat the problem of underflowing gradients encountered at long
|
||||
times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss
|
||||
scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
|
||||
encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
|
||||
occurred.
|
||||
:class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
|
||||
and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
|
||||
If a certain number of iterations occur without overflowing gradients detected,
|
||||
:class:`DynamicLossScaler` increases the loss scale once more.
|
||||
In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
|
||||
always using the highest loss scale possible without incurring overflow.
|
||||
|
||||
Args:
|
||||
init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler`.
|
||||
scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
|
||||
scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
|
||||
""" # noqa
|
||||
|
||||
def __init__(self,
|
||||
init_scale=2**32,
|
||||
scale_factor=2.,
|
||||
scale_window=1000,
|
||||
min_scale=1,
|
||||
delayed_shift=1,
|
||||
consecutive_hysteresis=False):
|
||||
self.cur_scale = init_scale
|
||||
self.cur_iter = 0
|
||||
self.last_overflow_iter = -1
|
||||
self.scale_factor = scale_factor
|
||||
self.scale_window = scale_window
|
||||
self.min_scale = min_scale
|
||||
self.delayed_shift = delayed_shift
|
||||
self.cur_hysteresis = delayed_shift
|
||||
self.consecutive_hysteresis = consecutive_hysteresis
|
||||
|
||||
# `params` is a list / generator of torch.Variable
|
||||
def has_overflow_serial(self, params):
|
||||
for p in params:
|
||||
if p.grad is not None and DynamicLossScaler._has_inf_or_nan(
|
||||
p.grad.data):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def has_overflow(self, params):
|
||||
overflow = self.has_overflow_serial(params)
|
||||
# Since each model parallel GPU carries only part of the model,
|
||||
# make sure overflow flag is synced across all the model parallel GPUs
|
||||
overflow_gpu = torch.cuda.ByteTensor([overflow])
|
||||
torch.distributed.all_reduce(
|
||||
overflow_gpu,
|
||||
op=torch.distributed.ReduceOp.MAX,
|
||||
group=mpu.get_model_parallel_group())
|
||||
overflow = overflow_gpu[0].item()
|
||||
return bool(overflow)
|
||||
|
||||
# `x` is a torch.Tensor
|
||||
def _has_inf_or_nan(x):
|
||||
try:
|
||||
# if x is half, the .float() incurs an additional deep copy, but it's necessary if
|
||||
# Pytorch's .sum() creates a one-element tensor of the same type as x
|
||||
# (which is true for some recent versions of pytorch).
|
||||
cpu_sum = float(x.float().sum())
|
||||
# More efficient version that can be used if .sum() returns a Python scalar
|
||||
# cpu_sum = float(x.sum())
|
||||
except RuntimeError as instance:
|
||||
# We want to check if instance is actually an overflow exception.
|
||||
# RuntimeError could come from a different error.
|
||||
# If so, we still want the exception to propagate.
|
||||
if 'value cannot be converted' not in instance.args[0]:
|
||||
raise
|
||||
return True
|
||||
else:
|
||||
if cpu_sum == float(
|
||||
'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
|
||||
return True
|
||||
return False
|
||||
|
||||
# `overflow` is boolean indicating whether the gradient overflowed
|
||||
def update_scale(self, overflow):
|
||||
|
||||
if not hasattr(self, 'min_scale'):
|
||||
self.min_scale = 1
|
||||
if not hasattr(self, 'delayed_shift'):
|
||||
self.delayed_shift = 1
|
||||
if not hasattr(self, 'cur_hysteresis'):
|
||||
self.cur_hysteresis = 1
|
||||
if not hasattr(self, 'consecutive_hysteresis'):
|
||||
self.consecutive_hysteresis = True
|
||||
if overflow:
|
||||
# self.cur_scale /= self.scale_factor
|
||||
if self.delayed_shift == 1 or self.cur_hysteresis == 1:
|
||||
self.cur_scale = max(self.cur_scale / self.scale_factor,
|
||||
self.min_scale)
|
||||
else:
|
||||
self.cur_hysteresis -= 1
|
||||
self.last_overflow_iter = self.cur_iter
|
||||
else:
|
||||
if self.consecutive_hysteresis:
|
||||
self.cur_hysteresis = self.delayed_shift
|
||||
if (self.cur_iter
|
||||
- self.last_overflow_iter) % self.scale_window == 0:
|
||||
if not self.consecutive_hysteresis:
|
||||
self.cur_hysteresis = self.delayed_shift
|
||||
self.cur_scale *= self.scale_factor
|
||||
self.cur_iter += 1
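    # Illustrative trajectory (not in the original file): with the defaults
    # init_scale=2**32, scale_factor=2.0, scale_window=1000, an overflow at
    # iteration k halves cur_scale (floored at min_scale), and every 1000
    # consecutive overflow-free iterations after the last overflow double it again.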
|
||||
|
||||
@property
|
||||
def loss_scale(self):
|
||||
return self.cur_scale
|
||||
|
||||
def scale_gradient(self, module, grad_in, grad_out):
|
||||
return tuple(self.loss_scale * g for g in grad_in)
|
||||
|
||||
def backward(self, loss, retain_graph=False):
|
||||
scaled_loss = loss * self.loss_scale
|
||||
scaled_loss.backward(retain_graph=retain_graph)
|
||||
|
||||
|
||||
##############################################################
|
||||
# Example usage below here -- assuming it's in a separate file
|
||||
##############################################################
|
||||
"""
|
||||
TO-DO separate out into an example.
|
||||
if __name__ == "__main__":
|
||||
import torch
|
||||
from torch.autograd import Variable
|
||||
from dynamic_loss_scaler import DynamicLossScaler
|
||||
|
||||
# N is batch size; D_in is input dimension;
|
||||
# H is hidden dimension; D_out is output dimension.
|
||||
N, D_in, H, D_out = 64, 1000, 100, 10
|
||||
|
||||
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
|
||||
x = Variable(torch.randn(N, D_in), requires_grad=False)
|
||||
y = Variable(torch.randn(N, D_out), requires_grad=False)
|
||||
|
||||
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
|
||||
w2 = Variable(torch.randn(H, D_out), requires_grad=True)
|
||||
parameters = [w1, w2]
|
||||
|
||||
learning_rate = 1e-6
|
||||
optimizer = torch.optim.SGD(parameters, lr=learning_rate)
|
||||
loss_scaler = DynamicLossScaler()
|
||||
|
||||
for t in range(500):
|
||||
y_pred = x.mm(w1).clamp(min=0).mm(w2)
|
||||
loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
|
||||
print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
|
||||
print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
|
||||
print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
|
||||
|
||||
# Run backprop
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
|
||||
# Check for overflow
|
||||
has_overflow = DynamicLossScaler.has_overflow(parameters)
|
||||
|
||||
# If no overflow, unscale grad and update as usual
|
||||
if not has_overflow:
|
||||
for param in parameters:
|
||||
param.grad.data.mul_(1. / loss_scaler.loss_scale)
|
||||
optimizer.step()
|
||||
# Otherwise, don't do anything -- ie, skip iteration
|
||||
else:
|
||||
print('OVERFLOW!')
|
||||
|
||||
# Update loss scale for next iteration
|
||||
loss_scaler.update_scale(has_overflow)
|
||||
|
||||
"""
|
||||
483
modelscope/models/nlp/mglm/generation_utils.py
Normal file
@@ -0,0 +1,483 @@
|
||||
# Copyright 2020 The HuggingFace Inc. team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import UserDict
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
PROCESS_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
|
||||
Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses.
|
||||
next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
|
||||
:obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses.
|
||||
next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
|
||||
Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond.
|
||||
pad_token_id (:obj:`int`, `optional`):
|
||||
The id of the `padding` token.
|
||||
eos_token_id (:obj:`int`, `optional`):
|
||||
The id of the `end-of-sequence` token.
|
||||
|
||||
Return:
|
||||
:obj:`UserDict`: A dictionary composed of the fields as defined above:
|
||||
|
||||
- **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated
|
||||
scores of all non-finished beams.
|
||||
- **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens
|
||||
to be added to the non-finished beam_hypotheses.
|
||||
- **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices
|
||||
indicating to which beam the next tokens shall be added.
|
||||
|
||||
"""
|
||||
|
||||
FINALIZE_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
|
||||
The final scores of all non-finished beams.
|
||||
final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
|
||||
The last tokens to be added to the non-finished beam_hypotheses.
|
||||
final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
|
||||
The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added.
|
||||
pad_token_id (:obj:`int`, `optional`):
|
||||
The id of the `padding` token.
|
||||
eos_token_id (:obj:`int`, `optional`):
|
||||
The id of the `end-of-sequence` token.
|
||||
|
||||
Return:
|
||||
:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
|
||||
sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all
|
||||
batches finished early due to the :obj:`eos_token_id`.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
class BeamScorer(ABC):
|
||||
"""
|
||||
Abstract base class for all beam scorers that are used for :meth:`~transformers.PretrainedModel.beam_search` and
|
||||
:meth:`~transformers.PretrainedModel.beam_sample`.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def process(self, input_ids: torch.LongTensor,
|
||||
next_scores: torch.FloatTensor, next_tokens: torch.LongTensor,
|
||||
next_indices: torch.LongTensor,
|
||||
**kwargs) -> Tuple[torch.Tensor]:
|
||||
raise NotImplementedError('This is an abstract method.')
|
||||
|
||||
@abstractmethod
|
||||
def finalize(self, input_ids: torch.LongTensor,
|
||||
next_scores: torch.FloatTensor, next_tokens: torch.LongTensor,
|
||||
next_indices: torch.LongTensor, **kwargs) -> torch.LongTensor:
|
||||
raise NotImplementedError('This is an abstract method.')
|
||||
|
||||
|
||||
class BeamSearchScorer(BeamScorer):
|
||||
r"""
|
||||
:class:`transformers.BeamScorer` implementing standard beam search decoding.
|
||||
|
||||
Adapted in part from `Facebook's XLM beam search code
|
||||
<https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529>`__.
|
||||
|
||||
Args:
|
||||
batch_size (:obj:`int`):
|
||||
Batch Size of :obj:`input_ids` for which beam search decoding is run in parallel.
|
||||
max_length (:obj:`int`):
|
||||
The maximum length of the sequence to be generated.
|
||||
num_beams (:obj:`int`):
|
||||
Number of beams for beam search.
|
||||
device (:obj:`torch.device`):
|
||||
Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of
|
||||
:obj:`BeamSearchScorer` will be allocated.
|
||||
length_penalty (:obj:`float`, `optional`, defaults to 1.0):
|
||||
Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
|
||||
model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
|
||||
sequences.
|
||||
do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
|
||||
num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1):
|
||||
The number of beam hypotheses that shall be returned upon calling
|
||||
:meth:`~transformer.BeamSearchScorer.finalize`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
batch_size: int,
|
||||
max_length: int,
|
||||
num_beams: int,
|
||||
            device: torch.device,
            length_penalty: Optional[float] = 1.0,
            do_early_stopping: Optional[bool] = False,
            num_beam_hyps_to_keep: Optional[int] = 1,
    ):
        self.max_length = max_length
        self.num_beams = num_beams
        self.device = device
        self.length_penalty = length_penalty
        self.do_early_stopping = do_early_stopping
        self.num_beam_hyps_to_keep = num_beam_hyps_to_keep

        self._is_init = False
        self._beam_hyps = [
            BeamHypotheses(
                num_beams=self.num_beams,
                max_length=self.max_length,
                length_penalty=self.length_penalty,
                early_stopping=self.do_early_stopping,
            ) for _ in range(batch_size)
        ]
        self._done = torch.tensor([False for _ in range(batch_size)],
                                  dtype=torch.bool,
                                  device=self.device)

        # if not isinstance(num_beams, int) or num_beams <= 1:
        #     raise ValueError(
        #     )

    @property
    def is_done(self) -> bool:
        return self._done.all()

    def process(self,
                input_ids: torch.LongTensor,
                next_scores: torch.FloatTensor,
                next_tokens: torch.LongTensor,
                next_indices: torch.LongTensor,
                pad_token_id: Optional[int] = None,
                eos_token_id: Optional[int] = None,
                mems=None) -> Tuple[torch.Tensor]:
        cur_len = input_ids.shape[-1]
        batch_size = len(self._beam_hyps)
        assert batch_size == (input_ids.shape[0] // self.num_beams)
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        device = next_scores.device
        next_beam_scores = torch.zeros((batch_size, self.num_beams),
                                       dtype=next_scores.dtype,
                                       device=device)
        next_beam_tokens = torch.zeros((batch_size, self.num_beams),
                                       dtype=next_tokens.dtype,
                                       device=device)
        next_beam_indices = torch.zeros((batch_size, self.num_beams),
                                        dtype=next_indices.dtype,
                                        device=device)

        for batch_idx, beam_hyp in enumerate(self._beam_hyps):
            if self._done[batch_idx]:
                assert (
                    len(beam_hyp) >= self.num_beams
                ), 'Batch can only be done if at least {} beams have been generated'.format(
                    self.num_beams)
                assert (
                    eos_token_id is not None and pad_token_id is not None
                ), 'generated beams >= num_beams -> eos_token_id and pad_token have to be defined'
                # pad the batch
                next_beam_scores[batch_idx, :] = 0
                next_beam_tokens[batch_idx, :] = pad_token_id
                next_beam_indices[batch_idx, :] = 0
                continue

            # next tokens for this sentence
            beam_idx = 0
            for beam_token_rank, (next_token, next_score,
                                  next_index) in enumerate(
                                      zip(next_tokens[batch_idx],
                                          next_scores[batch_idx],
                                          next_indices[batch_idx])):
                batch_beam_idx = batch_idx * self.num_beams + next_index
                # add to generated hypotheses if end of sentence
                if (eos_token_id is not None) and (next_token.item()
                                                   in eos_token_id):
                    # if beam_token does not belong to top num_beams tokens, it should not be added
                    is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.num_beams
                    if is_beam_token_worse_than_top_num_beams:
                        continue
                    beam_hyp.add(
                        input_ids[batch_beam_idx].clone(),
                        next_score.item(),
                        mems=[mem[[next_index.item()]]
                              for mem in mems] if mems else None)
                else:
                    # add next predicted token since it is not eos_token
                    next_beam_scores[batch_idx, beam_idx] = next_score
                    next_beam_tokens[batch_idx, beam_idx] = next_token
                    next_beam_indices[batch_idx, beam_idx] = batch_beam_idx
                    beam_idx += 1

                # once the beam for next step is full, don't add more tokens to it.
                if beam_idx == self.num_beams:
                    break

            if beam_idx < self.num_beams:
                raise ValueError(
                    f'At most {self.num_beams} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected.'  # noqa
                )  # noqa

            # Check if we are done so that we can save a pad step if all(done)
            self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done(
                next_scores[batch_idx].max().item(), cur_len)

        return UserDict({
            'next_beam_scores': next_beam_scores.view(-1),
            'next_beam_tokens': next_beam_tokens.view(-1),
            'next_beam_indices': next_beam_indices.view(-1),
        })

    def finalize(self,
                 input_ids: torch.LongTensor,
                 final_beam_scores: torch.FloatTensor,
                 final_beam_tokens: torch.LongTensor,
                 final_beam_indices: torch.LongTensor,
                 pad_token_id: Optional[int] = None,
                 eos_token_id: Optional[int] = None,
                 mems=None) -> Tuple[torch.LongTensor, List[torch.Tensor]]:
        batch_size = len(self._beam_hyps)

        # finalize all open beam hypotheses and add to generated hypotheses
        for batch_idx, beam_hyp in enumerate(self._beam_hyps):
            if self._done[batch_idx]:
                continue

            # need to add best num_beams hypotheses to generated hyps
            for beam_id in range(self.num_beams):
                batch_beam_idx = batch_idx * self.num_beams + beam_id
                final_score = final_beam_scores[batch_beam_idx].item()
                final_tokens = input_ids[batch_beam_idx]
                beam_hyp.add(
                    final_tokens,
                    final_score,
                    mems=[mem[[batch_beam_idx]]
                          for mem in mems] if mems else None)

        # select the best hypotheses
        sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep)
        best = []

        # retrieve best hypotheses
        for i, beam_hyp in enumerate(self._beam_hyps):
            sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0])
            for j in range(self.num_beam_hyps_to_keep):
                best_hyp, mems = sorted_hyps.pop()[1:]
                sent_lengths[self.num_beam_hyps_to_keep * i
                             + j] = len(best_hyp)
                best.append((best_hyp, mems))

        # prepare for adding eos
        sent_max_len = min(sent_lengths.max().item(), self.max_length)
        decoded: torch.LongTensor = input_ids.new(
            batch_size * self.num_beam_hyps_to_keep, sent_max_len)
        # shorter batches are padded if needed
        if sent_lengths.min().item() != sent_lengths.max().item():
            assert pad_token_id is not None, '`pad_token_id` has to be defined'
            decoded.fill_(pad_token_id)

        # fill with hypotheses and eos_token_id if the latter fits in
        mems = []
        for i, (hypo, mem) in enumerate(best):
            decoded[i, :sent_lengths[i]] = hypo
            if sent_lengths[i] < sent_max_len:
                decoded[i, sent_lengths[i]] = eos_token_id
            mems.append(mem)
        mems = [
            torch.cat([mem[i] for mem in mems], dim=0)
            for i in range(len(mems[0]))
        ] if mems and mems[0] else None
        return decoded, mems

class BeamHypotheses:

    def __init__(self, num_beams: int, max_length: int, length_penalty: float,
                 early_stopping: bool):
        """
        Initialize n-best list of hypotheses.
        """
        self.max_length = max_length - 1  # ignoring bos_token
        self.length_penalty = length_penalty
        self.early_stopping = early_stopping
        self.num_beams = num_beams
        self.beams = []
        self.worst_score = 1e9

    def __len__(self):
        """
        Number of hypotheses in the list.
        """
        return len(self.beams)

    def add(self, hyp: torch.LongTensor, sum_logprobs: float, mems=None):
        """
        Add a new hypothesis to the list.
        """
        score = sum_logprobs / (max(hyp.shape[-1], 1)**self.length_penalty)
        if len(self) < self.num_beams or score > self.worst_score:
            self.beams.append((score, hyp, mems))
            if len(self) > self.num_beams:
                sorted_next_scores = sorted([
                    (s, idx) for idx, (s, _, _) in enumerate(self.beams)
                ])
                del self.beams[sorted_next_scores[0][1]]
                self.worst_score = sorted_next_scores[1][0]
            else:
                self.worst_score = min(score, self.worst_score)

    def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
        """
        If there are enough hypotheses and none of the hypotheses being generated can become better than the worst
        one in the heap, then we are done with this sentence.
        """

        if len(self) < self.num_beams:
            return False
        elif self.early_stopping:
            return True
        else:
            cur_score = best_sum_logprobs / cur_len**self.length_penalty
            ret = self.worst_score >= cur_score
            return ret

class LogitsProcessor(ABC):
    """Abstract base class for all logit processors that can be applied during generation."""

    def __call__(self, input_ids: torch.LongTensor,
                 scores: torch.FloatTensor) -> torch.FloatTensor:
        """Torch method for processing logits."""
        raise NotImplementedError(
            f'{self.__class__} is an abstract class. Only classes inheriting this class can be called.'
        )


class LogitsProcessorList(list):
    """
    This class can be used to create a list of :class:`~transformers.LogitsProcessor` or
    :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from
    list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or
    :class:`~transformers.LogitsWarper` to the inputs.
    """

    def __call__(self, input_ids: torch.LongTensor,
                 scores: torch.FloatTensor) -> torch.FloatTensor:
        for processor in self:
            scores = processor(input_ids, scores)
        return scores


class MinLengthLogitsProcessor(LogitsProcessor):
    r"""
    :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0.

    Args:
        min_length (:obj:`int`):
            The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
        eos_token_id (:obj:`int`):
            The id of the `end-of-sequence` token.
    """

    def __init__(self, min_length: int, eos_token_id: int):
        if not isinstance(min_length, int) or min_length < 0:
            raise ValueError(
                f'`min_length` has to be a positive integer, but is {min_length}'
            )

        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(
                f'`eos_token_id` has to be a positive integer, but is {eos_token_id}'
            )

        self.min_length = min_length
        self.eos_token_id = eos_token_id

    def __call__(self, input_ids: torch.LongTensor,
                 scores: torch.FloatTensor) -> torch.FloatTensor:
        cur_len = input_ids.shape[-1]
        if cur_len < self.min_length:
            scores[:, self.eos_token_id] = -float('inf')
        return scores


class NoRepeatNGramLogitsProcessor(LogitsProcessor):
    r"""
    :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq
    <https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345>`__.

    Args:
        ngram_size (:obj:`int`):
            All ngrams of size :obj:`ngram_size` can only occur once.
    """

    def __init__(self, ngram_size: int):
        if not isinstance(ngram_size, int) or ngram_size <= 0:
            raise ValueError(
                f'`ngram_size` has to be a strictly positive integer, but is {ngram_size}'
            )
        self.ngram_size = ngram_size

    def __call__(self, input_ids: torch.LongTensor,
                 scores: torch.FloatTensor) -> torch.FloatTensor:
        num_batch_hypotheses = scores.shape[0]
        cur_len = input_ids.shape[-1]
        banned_batch_tokens = self._calc_banned_ngram_tokens(
            input_ids, num_batch_hypotheses, cur_len)

        for i, banned_tokens in enumerate(banned_batch_tokens):
            scores[i, banned_tokens] = -float('inf')

        return scores

    def _calc_banned_ngram_tokens(self, prev_input_ids: torch.Tensor,
                                  num_hypos: int,
                                  cur_len: int) -> List[Iterable[int]]:
        """Copied from fairseq for no_repeat_ngram in beam_search"""
        if cur_len + 1 < self.ngram_size:
            # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
            return [[] for _ in range(num_hypos)]
        generated_ngrams = [{} for _ in range(num_hypos)]
        for idx in range(num_hypos):
            gen_tokens = prev_input_ids[idx].tolist()
            generated_ngram = generated_ngrams[idx]
            for ngram in zip(*[gen_tokens[i:]
                               for i in range(self.ngram_size)]):
                prev_ngram_tuple = tuple(ngram[:-1])
                generated_ngram[prev_ngram_tuple] = generated_ngram.get(
                    prev_ngram_tuple, []) + [ngram[-1]]

        def _get_generated_ngrams(hypo_idx):
            # Before decoding the next token, prevent decoding of ngrams that have already appeared
            start_idx = cur_len + 1 - self.ngram_size
            ngram_idx = tuple(prev_input_ids[hypo_idx,
                                             start_idx:cur_len].tolist())
            return generated_ngrams[hypo_idx].get(ngram_idx, [])

        banned_tokens = [
            _get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)
        ]
        return banned_tokens
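For reference, the processors above compose through LogitsProcessorList the same way the transformers API does. The following is a minimal sketch only (toy tensors, made-up token ids and thresholds), assuming generation_utils imports cleanly outside a distributed setup:

# Illustrative sketch; vocab size, eos id and ngram size are placeholders.
import torch
from modelscope.models.nlp.mglm.generation_utils import (
    LogitsProcessorList, MinLengthLogitsProcessor, NoRepeatNGramLogitsProcessor)

processors = LogitsProcessorList([
    MinLengthLogitsProcessor(min_length=5, eos_token_id=2),
    NoRepeatNGramLogitsProcessor(ngram_size=3),
])
input_ids = torch.tensor([[7, 8, 9]])   # tokens generated so far
scores = torch.randn(1, 100)            # raw next-token logits
scores = processors(input_ids, scores)  # EOS is masked while the sequence is shorter than 5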
469
modelscope/models/nlp/mglm/mglm_for_text_summarization.py
Normal file
@@ -0,0 +1,469 @@
# Copyright (c) 2022 Zhipu.AI

import os
import random
from os import path as osp
from typing import Dict

import numpy as np
import torch
import torch.nn.functional as F

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import OutputKeys
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from . import mpu
from .arguments import get_args
from .generation_utils import BeamSearchScorer
from .train_utils import get_model
from .utils import load_checkpoint

__all__ = ['MGLMForTextSummarization']


def setup_args(args):
    args.block_lm = True
    args.task_mask = True
    args.cloze_eval = True
    args.num_layers = 24
    args.hidden_size = 1536
    args.num_attention_heads = 16
    args.max_position_embeddings = 1024
    args.tokenizer_type = 'ChineseSPTokenizer'
    args.load_pretrained = ''
    args.DDP_impl = 'none'
    args.model_parallel_size = 1
    args.fp16 = True
    args.cache_dir = 'cache'
    args.out_seq_length = 200
    args.seq_length = 512
    args.temperature = 0.9
    args.top_k = 2
    args.top_p = 0.8
    args.frequency_penalty = 0.1
    args.presence_penalty = 0.1
    args.mem_length = args.seq_length + args.mem_length - 1
    return args


def setup_model(args):
    """Setup model and optimizer."""

    model = get_model(args, model_type='generation')

    if args.load_pretrained is not None:
        args.no_load_optim = True
        args.load = args.load_pretrained
        _ = load_checkpoint(model, None, None, args)

    return model


def set_random_seed(seed):
    """Set random seed for reproducibility."""

    if seed is not None and seed > 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        mpu.model_parallel_cuda_manual_seed(seed)


def get_masks_and_position_ids(data,
                               eod_token,
                               reset_position_ids,
                               reset_attention_mask,
                               loss_mask=None,
                               attention_mask=None,
                               set_loss_mask=False,
                               mem_length=None):
    # Extract batch size and sequence length.
    batch_size, seq_length = data.size()

    # Attention mask (lower triangular).
    if mem_length:
        if attention_mask is None:
            attention_mask = torch.ones(
                (1, seq_length, seq_length + mem_length), device=data.device)
        attention_mask = torch.tril(
            torch.triu(attention_mask, 1 - seq_length + mem_length),
            mem_length)
    else:
        if reset_attention_mask:
            att_mask_batch = batch_size
        else:
            att_mask_batch = 1
        if attention_mask is None:
            attention_mask = torch.ones(
                (att_mask_batch, seq_length, seq_length), device=data.device)
        attention_mask = torch.tril(attention_mask)
    attention_mask = attention_mask.unsqueeze(1)

    # Loss mask.
    if loss_mask is None:
        loss_mask = torch.ones(
            data.size(), dtype=torch.float, device=data.device)

    # Position ids.
    position_ids = torch.arange(
        seq_length, dtype=torch.long, device=data.device)
    position_ids = position_ids.unsqueeze(0).expand_as(data)
    if set_loss_mask:
        loss_mask[data == eod_token] = 0.0
    # We need to clone as the ids will be modified based on batch index.
    if reset_position_ids:
        position_ids = position_ids.clone()

    if reset_position_ids or reset_attention_mask:
        # Loop through the batches:
        for b in range(batch_size):

            # Find indices where EOD token is.
            eod_index = position_ids[b, data[b] == eod_token]
            # Detach indices from positions if going to modify positions.
            if reset_position_ids:
                eod_index = eod_index.clone()

            # Loop through EOD indices:
            prev_index = 0
            for j in range(eod_index.size()[0]):
                i = eod_index[j]
                # Mask attention loss.
                if reset_attention_mask:
                    attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
                # Reset positions.
                if reset_position_ids:
                    position_ids[b, (i + 1):] -= (i + 1 - prev_index)
                    prev_index = i + 1

    return attention_mask, loss_mask, position_ids

def initialize_distributed(args):
    """Initialize torch.distributed."""

    # Manually set the device ids.
    device = args.rank % torch.cuda.device_count()
    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)
    # Call the init process
    init_method = 'tcp://'
    args.master_ip = os.getenv('MASTER_ADDR', 'localhost')
    args.master_port = os.getenv('MASTER_PORT', '6000')
    init_method += args.master_ip + ':' + args.master_port
    torch.distributed.init_process_group(
        backend=args.distributed_backend,
        world_size=args.world_size,
        rank=args.rank,
        init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)

    # Optional DeepSpeed Activation Checkpointing Features
    #
    if hasattr(
            args, 'deepspeed'
    ) and args.deepspeed and args.deepspeed_activation_checkpointing:
        set_deepspeed_activation_checkpointing(args)


def get_batch(context_tokens, device, args):
    tokens = context_tokens
    tokens = tokens.view(args.batch_size, -1).contiguous()
    tokens = tokens.to(device)

    # Get the masks and position ids.
    if args.block_lm:
        attention_mask = torch.tensor([tokens.size(1)],
                                      device=device,
                                      dtype=torch.long)
        position_ids = torch.arange(
            tokens.size(1), device=device, dtype=torch.long)
        if not args.no_block_position:
            block_position_ids = torch.zeros(
                tokens.size(1), device=device, dtype=torch.long)
            position_ids = torch.stack((position_ids, block_position_ids),
                                       dim=0)
        position_ids = position_ids.unsqueeze(0)
    else:
        attention_mask, loss_mask, position_ids = get_masks_and_position_ids(
            tokens,
            args.eod_token,
            reset_position_ids=False,
            reset_attention_mask=False,
            set_loss_mask=False,
            mem_length=args.mem_length)

    return tokens, attention_mask, position_ids


def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    # This function has been mostly taken from huggingface conversational ai code at
    # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313

    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1,
                                                                  None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # convert to 1D
        logits = logits.view(logits.size()[1]).contiguous()
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(
            F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
            ..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
        # going back to 2D
        logits = logits.view(1, -1).contiguous()

    return logits


def sample_sequence(model,
                    tokenizer,
                    context_tokens,
                    context_length,
                    args,
                    device,
                    mems=None,
                    end_tokens=None):
    if not args.block_lm:
        context_tokens, attention_mask, position_ids = get_batch(
            context_tokens, device, args)
        tokens = torch.empty((args.num_beams, 0),
                             device=context_tokens.device,
                             dtype=torch.long)
    else:
        tokens = context_tokens.new_full((1, 1),
                                         tokenizer.get_command('sop').Id)
    counter = 0
    if mems is None:
        mems = []
    if end_tokens is None:
        end_tokens = [args.eod_token]

    last_beam_num = 1
    output_tokens_list = []
    generated_tokens_list = []

    while counter < args.out_seq_length:
        if counter == 0 and not args.block_lm:
            next_token_logits, *mems = model(context_tokens, position_ids,
                                             attention_mask, *mems)
        else:
            if args.block_lm:
                if args.no_block_position:
                    position_ids = context_tokens.new_full(
                        (last_beam_num, 1), context_length + counter)
                else:
                    position_ids = context_tokens.new_ones(last_beam_num, 2, 1)
                    position_ids[:, 0] = context_length
                    position_ids[:, 1] = counter + 1
                attention_mask = context_tokens.new_zeros(
                    [1], device=context_tokens.device, dtype=torch.long)
            else:
                position_ids = context_tokens.new_ones((last_beam_num, 1)) * (
                    context_length + counter - 1)
                attention_mask = context_tokens.new_ones(
                    last_beam_num,
                    1,
                    1,
                    args.mem_length + 1,
                    device=context_tokens.device,
                    dtype=torch.float)
            last_token = tokens[:, -1:]
            next_token_logits, *mems = model(last_token, position_ids,
                                             attention_mask, *mems)
        next_token_logits = next_token_logits[:, -1]

        next_token_logits /= args.temperature
        frequency_count = torch.zeros(next_token_logits.shape)
        for tk in output_tokens_list:
            frequency_count[0][tk] += 1

        next_token_logits -= (args.frequency_penalty
                              * frequency_count).to(device)
        next_token_logits -= (
            args.presence_penalty *  # noqa
            (frequency_count > 0)).to(device)

        next_token_logits = top_k_logits(
            next_token_logits, top_k=args.top_k, top_p=args.top_p)
        log_probs = F.softmax(next_token_logits, dim=-1)
        prev = torch.multinomial(log_probs, num_samples=1)[0]
        is_end = prev.item() in end_tokens
        if is_end:
            break
        decode_tokens = tokenizer.DecodeIds([prev.item()])  # noqa
        generated_tokens_list.append(prev.item())
        prev = prev.view(1, 1)
        tokens = prev if tokens is None else torch.cat((tokens, prev), dim=1)
        counter += 1
        output_tokens_list = tokens.view(-1).contiguous()
    return torch.cat((context_tokens, tokens), dim=1), mems

def read_context(tokenizer, args, context):
    terminate_runs, skip_run = 0, 0  # noqa
    if mpu.get_model_parallel_rank() == 0:
        while True:
            # raw_text = input("\nContext prompt (stop to exit) >>> ")
            raw_text = context
            if not raw_text:
                print('Prompt should not be empty!')
                break
            # if raw_text == "stop":
            #     terminate_runs = 1
            #     break
            generation_mask = '[gMASK]' if args.task_mask else '[MASK]'
            if args.block_lm and 'MASK]' not in raw_text:
                raw_text += ' ' + generation_mask
            # output.write(raw_text)
            context_tokens = tokenizer.EncodeAsIds(raw_text).tokenization
            if args.block_lm:
                context_tokens = [tokenizer.get_command('ENC').Id
                                  ] + context_tokens
                if not raw_text.endswith('[gMASK]'):
                    context_tokens = context_tokens + [
                        tokenizer.get_command('eos').Id
                    ]
            context_length = len(context_tokens)

            if context_length >= args.seq_length:
                print('\nContext length', context_length,
                      '\nPlease give smaller context than the window length!')
                break
            break
    else:
        context_length = 0

    terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
    torch.distributed.broadcast(
        terminate_runs_tensor,
        mpu.get_model_parallel_src_rank(),
        group=mpu.get_model_parallel_group())
    terminate_runs = terminate_runs_tensor[0].item()

    if terminate_runs == 1:
        return terminate_runs, None, None, None

    context_length_tensor = torch.cuda.LongTensor([context_length])

    torch.distributed.broadcast(
        context_length_tensor,
        mpu.get_model_parallel_src_rank(),
        group=mpu.get_model_parallel_group())
    context_length = context_length_tensor[0].item()
    if mpu.get_model_parallel_rank() == 0:
        context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
    else:
        context_tokens_tensor = torch.cuda.LongTensor([0] * context_length)
    torch.distributed.broadcast(
        context_tokens_tensor,
        mpu.get_model_parallel_src_rank(),
        group=mpu.get_model_parallel_group())
    if mpu.get_model_parallel_rank() != 0:
        raw_text = tokenizer.DecodeIds(context_tokens_tensor.tolist())
    return terminate_runs, raw_text, context_tokens_tensor, context_length


@MODELS.register_module(Tasks.text_summarization, module_name=Models.mglm)
class MGLMForTextSummarization(TorchModel):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Initialize the text summarization model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)

        from .configure_data import prepare_tokenizer
        # Disable CuDNN.
        torch.backends.cudnn.enabled = False
        # Arguments.
        self.args = setup_args(get_args())
        self.args.load_pretrained = model_dir
        # Pytorch distributed.
        try:
            initialize_distributed(self.args)
        except RuntimeError:
            print('group process initialized twice')
        # Random seeds for reproducibility.
        set_random_seed(self.args.seed)
        # setting default batch size to 1
        self.args.batch_size = 1
        self.args.tokenizer_path = model_dir
        self.tokenizer = prepare_tokenizer(self.args)
        self.model = setup_model(self.args)
        self.cfg = Config.from_file(
            osp.join(model_dir, ModelFile.CONFIGURATION))

    def forward(self, input: Dict[str, str]) -> Dict[str, str]:
        pass

    def generate(self, input: Dict[str, str]) -> Dict[str, str]:
        model = self.model
        tokenizer = self.tokenizer
        args = self.args
        device = torch.cuda.current_device()
        model.eval()

        context = input['text'] + self.cfg.model.prompt
        with torch.no_grad():
            terminate_runs, raw_text, context_tokens_tensor, context_length = read_context(
                tokenizer, args, context)
            mems = []
            tokens, attention_mask, position_ids = get_batch(
                context_tokens_tensor, device, args)
            mask_tokens = ['MASK', 'sMASK', 'gMASK'
                           ] if args.task_mask else ['MASK']
            mask_tokens = [
                tokenizer.get_command(token).Id for token in mask_tokens
            ]
            end_tokens = [tokenizer.get_command('eop').Id, args.eod_token]

            mask_positions = []
            for token in mask_tokens:
                mask_positions += (context_tokens_tensor == token).nonzero(
                    as_tuple=True)[0].tolist()
            mask_positions.sort()
            if args.no_block_position:
                for mask_position in mask_positions:
                    position_ids[0, mask_position + 1:] += args.out_seq_length
            _, *mems = model(tokens, position_ids, attention_mask, *mems)
            for mask_position in mask_positions:
                if args.no_block_position:
                    position = position_ids[0, mask_position].item()
                else:
                    position = mask_position
                tokens, mems = sample_sequence(
                    model,
                    tokenizer,
                    tokens,
                    position,
                    args,
                    device,
                    mems=mems,
                    end_tokens=end_tokens)
        output_tokens_list = tokens.view(-1).contiguous()
        trim_decode_tokens = tokenizer.DecodeIds(output_tokens_list.tolist())
        res = trim_decode_tokens.split('<|startofpiece|>')[-1]
        print(res)
        return {OutputKeys.TEXT: res}
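The top-k/top-p filter used by generate() is self-contained, so its effect can be checked on a toy distribution. A small sketch (dummy logits and illustrative thresholds only), with top_k_logits as defined in the file above in scope:

# Illustrative sketch of the top_k_logits filter; values are toy numbers.
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0]])  # shape (1, vocab), as generate() uses
filtered = top_k_logits(logits.clone(), top_k=2, top_p=0.8)
probs = F.softmax(filtered, dim=-1)
# Only the two highest-scoring tokens keep non-zero probability;
# the rest were set to -inf by the filter.
print(probs)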
20
modelscope/models/nlp/mglm/model/__init__.py
Executable file
@@ -0,0 +1,20 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .distributed import (DistributedDataParallel,
                          PyTorchDistributedDataParallel)
from .downstream import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast,
                         GLMForSequenceClassification, GLMForSingleTokenCloze)
from .modeling_glm import (GLMModel,
                           glm_get_params_for_weight_decay_optimization)
127
modelscope/models/nlp/mglm/model/distributed.py
Executable file
@@ -0,0 +1,127 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Variable
from torch.nn.modules import Module
from torch.nn.parallel.distributed import DistributedDataParallel as DDP

from modelscope.models.nlp.mglm import mpu


class PyTorchDistributedDataParallel(DDP):

    def named_parameters(self, prefix: str = '', recurse: bool = True):
        return self.module.named_parameters(prefix=prefix, recurse=recurse)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        sd = self.module.state_dict(destination, prefix, keep_vars)
        return sd

    def load_state_dict(self, state_dict, strict=True):
        return self.module.load_state_dict(state_dict, strict=strict)


class DistributedDataParallel(Module):

    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module
        self.data_parallel_group = mpu.get_data_parallel_group()
        src_rank = mpu.get_model_parallel_rank()
        for p in self.module.parameters():
            if torch.is_tensor(p):
                dist.broadcast(p, src_rank, group=self.data_parallel_group)

        def allreduce_params(reduce_after=True,
                             no_scale=False,
                             fp32_allreduce=False):
            if (self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for name, param in self.module.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = (param.data.type())
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print(
                            'WARNING: gloo dist backend for half parameters may be extremely slow. It is recommended to use the NCCL backend in this case.'  # noqa
                        )
                        self.warn_on_half = False
                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    if fp32_allreduce:
                        coalesced = coalesced.float()
                    if not no_scale and not reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    dist.all_reduce(coalesced, group=self.data_parallel_group)
                    torch.cuda.synchronize()
                    if not no_scale and reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        self.hook_handles = []
        self.hooks = []
        for param in list(self.module.parameters()):

            def allreduce_hook(*unused):
                Variable._execution_engine.queue_callback(allreduce_params)

        self.allreduce_params = allreduce_params

    def forward(self, *inputs, **kwargs):
        self.needs_reduction = True
        return self.module(*inputs, **kwargs)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        sd = self.module.state_dict(destination, prefix, keep_vars)
        return sd

    def load_state_dict(self, state_dict, strict=True):
        return self.module.load_state_dict(state_dict, strict=strict)

    def named_parameters(self, prefix: str = '', recurse: bool = True):
        return self.module.named_parameters(prefix=prefix, recurse=recurse)

    '''
    def _sync_buffers(self):
        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)
    def train(self, mode=True):
        # Clear NCCL communicator and CUDA event cache of the default group ID,
        # These cache will be recreated at the later call. This is currently a
        # work-around for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()
        super(DistributedDataParallel, self).train(mode)
        self.module.train(mode)
    '''
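The allreduce_params closure above coalesces gradients per dtype before issuing a single collective. The flatten/unflatten round trip it relies on can be sketched without a process group; this is an illustration only, run on toy tensors in a single process with no dist calls:

# Sketch of the coalescing pattern used by allreduce_params; no all_reduce here.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

grads = [torch.ones(2, 3), torch.full((4,), 2.0)]  # stand-ins for param.grad.data
coalesced = _flatten_dense_tensors(grads)          # one flat buffer for a single collective
coalesced /= 2                                     # what dividing by world_size would do
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
    buf.copy_(synced)                              # write the averaged values back in place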
242
modelscope/models/nlp/mglm/model/downstream.py
Normal file
@@ -0,0 +1,242 @@
# Copyright (c) 2022 Zhipu.AI
"""Multiple choice model."""

import torch
import torch.nn

from .modeling_glm import GLMModel


class GLMForMultiTokenCloze(torch.nn.Module):

    def __init__(self,
                 language_model: GLMModel,
                 take_softmax=True,
                 length_penalty=0.0):
        super(GLMForMultiTokenCloze, self).__init__()
        self.model = language_model
        self.take_softmax = take_softmax
        self.length_penalty = length_penalty

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        # [h.remove() for h in self.hook_handles]
        sd = self.model.state_dict(destination, prefix, keep_vars)
        return sd

    def load_state_dict(self, state_dict, strict=True):
        return self.model.load_state_dict(state_dict, strict=strict)

    def named_parameters(self, prefix: str = '', recurse: bool = True):
        return self.model.named_parameters(prefix=prefix, recurse=recurse)

    def forward(self,
                input_ids,
                position_ids,
                attention_mask,
                target_ids=None,
                logit_mask=None,
                prompt_pos=None):
        if target_ids is None:
            return self.model(input_ids, position_ids, attention_mask)
        num_choices = None
        if len(input_ids.shape) == 3:
            batch_size, num_choices = input_ids.shape[:2]
            input_ids = input_ids.reshape(-1, input_ids.size(-1))
            attention_mask = attention_mask.reshape(-1,
                                                    *attention_mask.size()[2:])
            position_ids = position_ids.reshape(-1, *position_ids.size()[2:])
            target_ids = target_ids.reshape(-1, target_ids.size(-1))
            logit_mask = logit_mask.reshape(-1, logit_mask.size(-1))
            if prompt_pos is not None:
                prompt_pos = prompt_pos.reshape(-1, prompt_pos.size(-1))
        outputs, *mems = self.model(
            input_ids, position_ids, attention_mask, prompt_pos=prompt_pos)
        if self.take_softmax:
            outputs = torch.nn.functional.log_softmax(outputs, dim=-1)
        # select the target logits
        batch_ids = torch.arange(
            target_ids.size(0), dtype=torch.long, device=target_ids.device)
        batch_ids = batch_ids.unsqueeze(1).expand_as(target_ids)
        seq_ids = torch.arange(
            target_ids.size(-1), dtype=torch.long, device=target_ids.device)
        seq_ids = seq_ids.unsqueeze(0).expand_as(target_ids)
        logits = outputs[batch_ids, seq_ids, target_ids]
        logits = (logits * logit_mask).sum(dim=1)
        if self.length_penalty > 0.0:
            logits = logits / logit_mask.sum(dim=1)**self.length_penalty
        if num_choices is not None:
            logits = logits.view(-1, num_choices)
        return (logits, *mems)


class GLMForMultiTokenClozeFast(torch.nn.Module):

    def __init__(self, language_model, take_softmax=True, length_penalty=0.0):
        super(GLMForMultiTokenClozeFast, self).__init__()
        self.model = language_model
        self.take_softmax = take_softmax
        self.length_penalty = length_penalty

    def forward(self, input_ids, position_ids, attention_mask, dec_input_ids,
                dec_position_ids, dec_attention_mask, dec_target_ids,
                dec_logit_mask):
        # encoder
        outputs, *mems = self.model(
            input_ids,
            position_ids,
            attention_mask,
            return_memory=True,
            detach_memory=False)
        batch_size, num_choices, max_dec_len = dec_input_ids.size()
        max_enc_len = input_ids.size(-1)

        enc_mems = []
        for hidden in mems:
            hidden = hidden.unsqueeze(1).expand(-1, num_choices, -1,
                                                -1).reshape(
                                                    batch_size * num_choices,
                                                    *hidden.size()[1:])
            enc_mems.append(hidden)

        def build_dec_mask_matrix(seq_length, sep, memory_length=0):
            m = enc_mems[0].new_ones((1, seq_length, seq_length))
            m = torch.tril(m)

            # sep = dec_attention_mask
            ids = torch.arange(
                memory_length, device=sep.device, dtype=sep.dtype).view(1, -1)
            mask = ids < sep.view(-1, 1)  # batch * mem
            mask = mask.unsqueeze(1).float().expand(-1, seq_length, -1)

            m = m.expand(batch_size * num_choices, -1, -1)
            m = torch.cat((mask, m), dim=2)
            m = m.unsqueeze(1)
            return m

        dec_input_ids = dec_input_ids.reshape(-1, max_dec_len)
        dec_position_ids = dec_position_ids.reshape(
            -1,
            *dec_position_ids.size()[2:])
        # dec_attention_mask = dec_attention_mask.reshape(-1, *dec_attention_mask.size()[2:]).unsqueeze(1)
        dec_attention_mask = build_dec_mask_matrix(
            max_dec_len, dec_attention_mask.reshape(-1), max_enc_len)
        dec_target_ids = dec_target_ids.reshape(-1, dec_target_ids.size(-1))
        dec_logit_mask = dec_logit_mask.reshape(-1, dec_logit_mask.size(-1))

        outputs, *mems = self.model(dec_input_ids, dec_position_ids,
                                    dec_attention_mask, *enc_mems)
        if self.take_softmax:
            outputs = torch.nn.functional.log_softmax(outputs, dim=-1)

        batch_ids = torch.arange(
            dec_target_ids.size(0),
            dtype=torch.long,
            device=dec_target_ids.device)
        batch_ids = batch_ids.unsqueeze(1).expand_as(dec_target_ids)
        seq_ids = torch.arange(
            dec_target_ids.size(-1),
            dtype=torch.long,
            device=dec_target_ids.device)
        seq_ids = seq_ids.unsqueeze(0).expand_as(dec_target_ids)
        logits = outputs[batch_ids, seq_ids, dec_target_ids]
        logits = (logits * dec_logit_mask).sum(dim=1)
        if self.length_penalty > 0.0:
            logits = logits / dec_logit_mask.sum(dim=1)**self.length_penalty
        if num_choices is not None:
            logits = logits.view(-1, num_choices)
        return (logits, *mems)


class GLMForSingleTokenCloze(torch.nn.Module):

    def __init__(self, language_model, take_softmax=False):
        super().__init__()
        self.model = language_model
        self.take_softmax = take_softmax

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        # [h.remove() for h in self.hook_handles]
        sd = self.model.state_dict(destination, prefix, keep_vars)
        return sd

    def load_state_dict(self, state_dict, strict=True):
        return self.model.load_state_dict(state_dict, strict=strict)

    def named_parameters(self, prefix: str = '', recurse: bool = True):
        return self.model.named_parameters(prefix=prefix, recurse=recurse)

    def forward(self,
                input_ids,
                position_ids,
                attention_mask,
                target_ids=None,
                logit_mask=None,
                prompt_pos=None):
        if target_ids is None:
            return self.model(input_ids, position_ids, attention_mask)
        assert len(input_ids.shape) == 2
        outputs, *mems = self.model(
            input_ids, position_ids, attention_mask, prompt_pos=prompt_pos)
        batch_ids = torch.arange(
            outputs.size(0),
            dtype=attention_mask.dtype,
            device=attention_mask.device)
        target_logits = outputs[batch_ids, attention_mask]
        if self.take_softmax:
            target_prob = torch.nn.functional.log_softmax(
                target_logits, dim=-1)
        else:
            target_prob = target_logits
        batch_ids = batch_ids.unsqueeze(1).expand_as(target_ids)
        output = target_prob[batch_ids, target_ids]

        return (output, target_logits, *mems)


class GLMForSequenceClassification(torch.nn.Module):

    def __init__(self,
                 language_model,
                 hidden_size,
                 hidden_dropout,
                 pool_token,
                 num_class=1):
        super().__init__()
        self.pool_token = pool_token
        self.model = language_model
        self.num_class = num_class
        # Multi-choice head.
        self.pool_layer = torch.nn.Linear(hidden_size, hidden_size)
        self.multichoice_dropout = torch.nn.Dropout(hidden_dropout)
        self.multichoice_head = torch.nn.Linear(hidden_size, num_class)

    def forward(self, input_ids, position_ids, attention_mask):
        num_choices = None
        if len(input_ids.shape) == 3:
            assert self.num_class == 1
            batch_size, num_choices = input_ids.shape[:2]
            input_ids = input_ids.reshape(-1, input_ids.size(-1))
            attention_mask = attention_mask.reshape(-1,
                                                    *attention_mask.size()[2:])
            position_ids = position_ids.reshape(-1, *position_ids.size()[2:])
        outputs, *mems = self.model(input_ids, position_ids, attention_mask)
        if self.pool_token == 'start':
            output = outputs[torch.arange(
                outputs.size(0),
                dtype=attention_mask.dtype,
                device=attention_mask.device), attention_mask]
        elif self.pool_token == 'pad':
            output = outputs[torch.arange(
                outputs.size(0),
                dtype=attention_mask.dtype,
                device=attention_mask.device), attention_mask - 1]
        elif self.pool_token == 'cls':
            output = outputs[:, 0]
        else:
            raise NotImplementedError
        output = torch.tanh(self.pool_layer(output))
        multichoice_output = self.multichoice_dropout(output)
        logits = self.multichoice_head(multichoice_output)
        if num_choices is not None:
            logits = logits.view(-1, num_choices)
        return (logits, *mems)
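The cloze heads above score a candidate answer by advanced indexing, outputs[batch_ids, seq_ids, target_ids], followed by a masked sum. The indexing itself can be sketched on toy shapes; this is purely illustrative (random values, made-up shapes):

# Toy illustration of the target-logit gather used by GLMForMultiTokenCloze.
import torch

outputs = torch.randn(2, 4, 10)  # (batch, seq, vocab) log-probs
target_ids = torch.tensor([[1, 3, 0, 0], [5, 2, 7, 0]])
logit_mask = torch.tensor([[1., 1., 0., 0.], [1., 1., 1., 0.]])

batch_ids = torch.arange(2).unsqueeze(1).expand_as(target_ids)
seq_ids = torch.arange(4).unsqueeze(0).expand_as(target_ids)
scores = outputs[batch_ids, seq_ids, target_ids]  # (batch, seq): log-prob of each target token
scores = (scores * logit_mask).sum(dim=1)          # masked sum -> one score per candidate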
1576
modelscope/models/nlp/mglm/model/modeling_bert.py
Normal file
File diff suppressed because it is too large
245
modelscope/models/nlp/mglm/model/modeling_glm.py
Normal file
@@ -0,0 +1,245 @@
# Modified by Zhipu.AI
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT-2 model."""

import torch
import torch.nn as nn
import torch.nn.functional as F

from modelscope.models.nlp.mglm import mpu
from modelscope.models.nlp.mglm.model.prompt import PromptSpell
from modelscope.models.nlp.mglm.utils import print_rank_0


def init_method_normal(std=0.02):
    """Init method based on normal distribution.

    This is only used for embeddings. The transformer has its
    own initializer.
    """

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_


class GLMModel(torch.nn.Module):
    """GLM Language model.

    The output of the forward method are the logits (parallel or
    serial depending on the `parallel_output` flag).
    """

    def __init__(
        self,
        num_layers,
        vocab_size,
        hidden_size,
        num_attention_heads,
        embedding_dropout_prob,
        attention_dropout_prob,
        output_dropout_prob,
        max_sequence_length,
        max_memory_length,
        checkpoint_activations,
        checkpoint_num_layers=1,
        parallel_output=True,
        relative_encoding=False,
        block_position_encoding=False,
        output_predict=True,
        spell_length=None,
        spell_func='lstm',
        attention_scale=1.0,
    ):

        super(GLMModel, self).__init__()

        self.parallel_output = parallel_output
        self.output_predict = output_predict
        self.hidden_size = hidden_size

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Transformer
        self.transformer = mpu.GPT2ParallelTransformer(
            num_layers,
            hidden_size,
            num_attention_heads,
            max_sequence_length,
            max_memory_length,
            embedding_dropout_prob,
            attention_dropout_prob,
            output_dropout_prob,
            checkpoint_activations,
            checkpoint_num_layers,
            attention_scale=attention_scale,
            relative_encoding=relative_encoding,
            block_position_encoding=block_position_encoding)
        if spell_length is not None:
            self.prompt_spell = PromptSpell(spell_length, self.hidden_size,
                                            spell_func)

    def freeze_transformer(self, tune_prefix_layers=None):
        log_str = 'Freeze transformer'
        self.word_embeddings.requires_grad_(False)
        self.transformer.requires_grad_(False)
        if tune_prefix_layers is not None:
            log_str += f' tune {tune_prefix_layers} prefix layers'
            for i in range(tune_prefix_layers):
                self.transformer.layers[i].requires_grad_(True)
        print_rank_0(log_str)

    def forward(self,
                input_ids,
                position_ids,
                attention_mask,
                *mems,
                return_memory=False,
                detach_memory=True,
                prompt_pos=None):
        # Embeddings.
        batch_size = input_ids.size(0)
        words_embeddings = self.word_embeddings(input_ids)
        embeddings = words_embeddings
        if prompt_pos is not None:
            embeddings = embeddings.clone()
            prompt_embeds = self.prompt_spell()
            batch_index = torch.arange(
                batch_size, device=input_ids.device).unsqueeze(1)
            embeddings[batch_index, prompt_pos] = prompt_embeds
        # Transformer.
        transformer_output = self.transformer(
            embeddings,
            position_ids,
            attention_mask,
            mems,
            return_memory=return_memory,
            detach_memory=detach_memory)
        logits, hidden_layers = transformer_output
        outputs = hidden_layers

        if self.output_predict:
            # Parallel logits.
            logits_parallel = mpu.copy_to_model_parallel_region(logits)
            logits_parallel = F.linear(logits_parallel,
                                       self.word_embeddings.weight)

            if self.parallel_output:
                return (logits_parallel, *outputs)

            return (mpu.gather_from_model_parallel_region(logits_parallel),
                    *outputs)
        else:
            return (logits, *outputs)


class EncoderDecoder(torch.nn.Module):
    """Seq2Seq Transformer Model
    The output of the forward method are the logits (parallel or serial depending on the `parallel_output` flag).
    """

    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 max_memory_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 output_predict=True):
        super(EncoderDecoder, self).__init__()

        self.parallel_output = parallel_output
        self.output_predict = output_predict

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Transformer
        self.encoder = mpu.GPT2ParallelTransformer(
            num_layers, hidden_size, num_attention_heads, max_sequence_length,
            max_memory_length, embedding_dropout_prob, attention_dropout_prob,
            output_dropout_prob, checkpoint_activations, checkpoint_num_layers)
        self.decoder = mpu.GPT2ParallelTransformer(
            num_layers,
            hidden_size,
            num_attention_heads,
            max_sequence_length,
            max_memory_length,
            embedding_dropout_prob,
            attention_dropout_prob,
            output_dropout_prob,
            checkpoint_activations,
            checkpoint_num_layers,
            use_decoder_layer=True)

    def forward(self, source_ids, target_ids, source_position_ids,
                target_position_ids, source_mask, target_mask):
        # Embeddings.
        source_embeddings = self.word_embeddings(source_ids)
        target_embeddings = self.word_embeddings(target_ids)

        # Transformer.
        encoder_output, _ = self.encoder(source_embeddings,
                                         source_position_ids, source_mask)
        decoder_output, _ = self.decoder(target_embeddings,
                                         target_position_ids, target_mask)
        if self.output_predict:
            # Parallel logits.
            output_parallel = mpu.copy_to_model_parallel_region(decoder_output)
            logits_parallel = F.linear(output_parallel,
                                       self.word_embeddings.weight)

            if self.parallel_output:
                return (logits_parallel, )

            return (mpu.gather_from_model_parallel_region(logits_parallel), )
        else:
            return (decoder_output, )


def glm_get_params_for_weight_decay_optimization(module):
    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module_ in module.modules():
        if isinstance(module_, (mpu.LayerNorm, torch.nn.LayerNorm)):
            no_weight_decay_params['params'].extend([
                p for p in list(module_._parameters.values())
                if p is not None and p.requires_grad
            ])
        else:
            weight_decay_params['params'].extend([
                p for n, p in list(module_._parameters.items())
                if p is not None and p.requires_grad and n != 'bias'
            ])
            no_weight_decay_params['params'].extend([
                p for n, p in list(module_._parameters.items())
                if p is not None and p.requires_grad and n == 'bias'
            ])

    return weight_decay_params, no_weight_decay_params
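glm_get_params_for_weight_decay_optimization returns two torch-style parameter groups, with LayerNorm weights and biases excluded from weight decay. A hedged sketch of how they would typically be handed to an optimizer, assuming the helper is in scope; the toy module and AdamW hyperparameters are placeholders, and the helper falls back to torch.nn.LayerNorm when mpu.LayerNorm is not involved:

# Sketch only: wiring the two param groups into a torch optimizer.
import torch

toy = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
decay, no_decay = glm_get_params_for_weight_decay_optimization(toy)
optimizer = torch.optim.AdamW([decay, no_decay], lr=1e-5, weight_decay=0.01)
# no_decay carries weight_decay=0.0, so the Linear bias and the LayerNorm
# parameters are exempt from decay regardless of the optimizer default.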
59
modelscope/models/nlp/mglm/model/prompt.py
Normal file
@@ -0,0 +1,59 @@
# Copyright (c) 2022 Zhipu.AI

import random

import torch


class PromptSpell(torch.nn.Module):

    def __init__(self, spell_length, hidden_size, spell_func):
        super(PromptSpell, self).__init__()
        self.spell_length = spell_length
        self.hidden_size = hidden_size
        self.spell_embeddings = torch.nn.Embedding(self.spell_length,
                                                   self.hidden_size)
        self.spell_func = spell_func
        if self.spell_func == 'lstm':
            self.lstm_head = torch.nn.LSTM(
                input_size=self.hidden_size,
                hidden_size=self.hidden_size,
                num_layers=2,
                # dropout=self.lstm_dropout,
                bidirectional=True,
                batch_first=True)  # .to(torch.device("cuda"))
            self.mlp_head = torch.nn.Sequential(
                torch.nn.Linear(2 * self.hidden_size, self.hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(self.hidden_size, self.hidden_size))
        elif self.spell_func == 'mlp':
            self.mlp_head = torch.nn.Sequential(
                torch.nn.Linear(self.hidden_size, self.hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(self.hidden_size, self.hidden_size))
        elif self.spell_func != 'none':
            raise NotImplementedError('Prompt function ' + self.spell_func)

    def init_embedding(self, word_embeddings=None, task_tokens=None):
        num_words = 5000
        with torch.no_grad():
            for i in range(self.spell_length):
                rand_token = random.randrange(num_words)
                if task_tokens is None:
                    target_embedding = word_embeddings[rand_token]
                else:
                    word_embedding = word_embeddings[rand_token]
                    task_token = random.choice(task_tokens)
                    task_embedding = word_embeddings[task_token]
                    ratio = random.random()
                    target_embedding = word_embedding * ratio + task_embedding * (
                        1 - ratio)
                self.spell_embeddings.weight.data[i] = target_embedding

    def forward(self):
        prompt_embeds = self.spell_embeddings.weight.unsqueeze(0)
        if self.spell_func == 'lstm':
            prompt_embeds = self.lstm_head(prompt_embeds)[0]
        if self.spell_func == 'lstm' or self.spell_func == 'mlp':
            prompt_embeds = self.mlp_head(prompt_embeds)
        return prompt_embeds
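PromptSpell produces one learned embedding per prompt position regardless of the spell_func chosen. A small sketch (toy sizes only) of the expected output shape, assuming the class is importable from the module path this diff creates:

# Sketch: the prompt encoder returns a (1, spell_length, hidden_size) tensor.
import torch
from modelscope.models.nlp.mglm.model.prompt import PromptSpell

spell = PromptSpell(spell_length=4, hidden_size=16, spell_func='mlp')
with torch.no_grad():
    prompt_embeds = spell()
print(prompt_embeds.shape)  # torch.Size([1, 4, 16])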
37
modelscope/models/nlp/mglm/mpu/__init__.py
Executable file
@@ -0,0 +1,37 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model parallel utility interface."""

from .cross_entropy import vocab_parallel_cross_entropy
from .data import broadcast_data
from .grads import clip_grad_norm
from .initialize import (destroy_model_parallel, get_data_parallel_group,
                         get_data_parallel_rank, get_data_parallel_world_size,
                         get_model_parallel_group, get_model_parallel_rank,
                         get_model_parallel_src_rank,
                         get_model_parallel_world_size,
                         initialize_model_parallel,
                         model_parallel_is_initialized)
from .layers import (ColumnParallelLinear, ParallelEmbedding,
                     RowParallelLinear, VocabParallelEmbedding)
from .mappings import (copy_to_model_parallel_region,
                       gather_from_model_parallel_region,
                       reduce_from_model_parallel_region,
                       scatter_to_model_parallel_region)
from .random import (checkpoint, get_cuda_rng_tracker,
                     model_parallel_cuda_manual_seed,
                     partition_activations_in_checkpoint)
from .transformer import (BertParallelSelfAttention,
                          BertParallelTransformerLayer,
                          GPT2ParallelTransformer, LayerNorm)
110
modelscope/models/nlp/mglm/mpu/cross_entropy.py
Normal file
@@ -0,0 +1,110 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from .initialize import (get_model_parallel_group, get_model_parallel_rank,
|
||||
get_model_parallel_world_size)
|
||||
from .utils import VocabUtility
|
||||
|
||||
|
||||
class _VocabParallelCrossEntropy(torch.autograd.Function):
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, vocab_parallel_logits, target):
|
||||
|
||||
# Copy so the input remains unchanged.
|
||||
logits = vocab_parallel_logits.clone()
|
||||
# Maximum value along vocab dimension across all GPUs.
|
||||
logits_max = torch.max(logits, dim=-1)[0]
|
||||
torch.distributed.all_reduce(
|
||||
logits_max,
|
||||
op=torch.distributed.ReduceOp.MAX,
|
||||
group=get_model_parallel_group())
|
||||
# Subtract the maximum value.
|
||||
logits.sub_(logits_max.unsqueeze(dim=-1))
|
||||
# Sum of exponential of logits along vocab dimension across all GPUs.
|
||||
exp_logits = logits.exp()
|
||||
sum_exp_logits = exp_logits.sum(dim=-1)
|
||||
torch.distributed.all_reduce(
|
||||
sum_exp_logits,
|
||||
op=torch.distributed.ReduceOp.SUM,
|
||||
group=get_model_parallel_group())
|
||||
|
||||
        # Get the partition's vocab indices
|
||||
get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
|
||||
partition_vocab_size = vocab_parallel_logits.size()[-1]
|
||||
rank = get_model_parallel_rank()
|
||||
world_size = get_model_parallel_world_size()
|
||||
vocab_start_index, vocab_end_index = get_vocab_range(
|
||||
partition_vocab_size, rank, world_size)
|
||||
|
||||
# Create a mask of valid vocab ids (1 means it needs to be masked).
|
||||
target_mask = (target < vocab_start_index) | (
|
||||
target >= vocab_end_index)
|
||||
masked_target = target.clone() - vocab_start_index
|
||||
masked_target[target_mask] = 0
|
||||
|
||||
# Get predicted-logits = logits[target].
|
||||
# For Simplicity, we convert logits to a 2-D tensor with size
|
||||
# [*, partition-vocab-size] and target to a 1-D tensor of size [*].
|
||||
logits_2d = logits.view(-1, partition_vocab_size)
|
||||
masked_target_1d = masked_target.view(-1)
|
||||
arange_1d = torch.arange(
|
||||
start=0, end=logits_2d.size()[0], device=logits_2d.device)
|
||||
predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
|
||||
predicted_logits = predicted_logits_1d.view_as(target)
|
||||
predicted_logits[target_mask] = 0.0
|
||||
# All reduce is needed to get the chunks from other GPUs.
|
||||
torch.distributed.all_reduce(
|
||||
predicted_logits,
|
||||
op=torch.distributed.ReduceOp.SUM,
|
||||
group=get_model_parallel_group())
|
||||
|
||||
# Loss = log(sum(exp(logits))) - predicted-logit.
|
||||
loss = torch.log(sum_exp_logits) - predicted_logits
|
||||
|
||||
# Store softmax, target-mask and masked-target for backward pass.
|
||||
exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
|
||||
ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
|
||||
|
||||
return loss
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
|
||||
        # Retrieve tensors from the forward path.
|
||||
softmax, target_mask, masked_target_1d = ctx.saved_tensors
|
||||
|
||||
        # All the inputs have softmax as their gradient.
|
||||
grad_input = softmax
|
||||
# For simplicity, work with the 2D gradient.
|
||||
partition_vocab_size = softmax.size()[-1]
|
||||
grad_2d = grad_input.view(-1, partition_vocab_size)
|
||||
|
||||
# Add the gradient from matching classes.
|
||||
arange_1d = torch.arange(
|
||||
start=0, end=grad_2d.size()[0], device=grad_2d.device)
|
||||
grad_2d[arange_1d,
|
||||
masked_target_1d] -= (1.0 - target_mask.view(-1).float())
|
||||
|
||||
# Finally elementwise multiplication with the output gradients.
|
||||
grad_input.mul_(grad_output.unsqueeze(dim=-1))
|
||||
|
||||
return grad_input, None
|
||||
|
||||
|
||||
def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
|
||||
"""Helper function for the cross entropy."""
|
||||
return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
|
||||
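_VocabParallelCrossEntropy computes a numerically stable cross entropy while the vocabulary dimension of the logits is sharded across model-parallel ranks: the per-rank maxima and exp-sums are all-reduced, and each rank contributes the target logit only when the target id falls inside its vocab partition. A single-process sketch of the same math (hypothetical shard layout, no distributed calls) that can be checked against torch.nn.functional.cross_entropy:

import torch
import torch.nn.functional as F

# Single-process sketch of the sharded cross-entropy math above: the vocab is
# split into contiguous shards, and the partial statistics are combined the same
# way the all-reduces in _VocabParallelCrossEntropy combine them across ranks.
def sharded_cross_entropy(logits, target, world_size=2):
    per_rank = logits.size(-1) // world_size
    shards = logits.split(per_rank, dim=-1)

    # "All-reduce" of the max and of the exp-sum, done locally here.
    logits_max = torch.stack([s.max(dim=-1)[0] for s in shards]).max(dim=0)[0]
    sum_exp = sum(
        (s - logits_max.unsqueeze(-1)).exp().sum(dim=-1) for s in shards)

    # Each "rank" contributes the predicted logit only if it owns the target id.
    predicted = torch.zeros_like(logits_max)
    for rank, shard in enumerate(shards):
        start = rank * per_rank
        mask = (target >= start) & (target < start + per_rank)
        local_idx = (target - start).clamp(0, per_rank - 1).unsqueeze(-1)
        local = shard.gather(-1, local_idx).squeeze(-1) - logits_max
        predicted += torch.where(mask, local, torch.zeros_like(local))

    return torch.log(sum_exp) - predicted

logits = torch.randn(3, 8)
target = torch.randint(0, 8, (3,))
reference = F.cross_entropy(logits, target, reduction='none')
assert torch.allclose(sharded_cross_entropy(logits, target), reference, atol=1e-5)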
117
modelscope/models/nlp/mglm/mpu/data.py
Normal file
@@ -0,0 +1,117 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from .initialize import (get_model_parallel_group, get_model_parallel_rank,
|
||||
get_model_parallel_src_rank)
|
||||
|
||||
_MAX_DATA_DIM = 5
|
||||
|
||||
|
||||
def _check_data_types(keys, data, target_dtype):
|
||||
"""Check that all the keys have the same target data type."""
|
||||
for key in keys:
|
||||
assert data[key].dtype == target_dtype, '{} has data type {} which '\
|
||||
'is different than {}'.format(key, data[key].dtype, target_dtype)
|
||||
|
||||
|
||||
def _build_key_size_numel_dictionaries(keys, data):
|
||||
"""Build the size on rank 0 and broadcast."""
|
||||
max_dim = _MAX_DATA_DIM
|
||||
sizes = [0 for _ in range(max_dim) for _ in keys]
|
||||
|
||||
# Pack the sizes on rank zero.
|
||||
if get_model_parallel_rank() == 0:
|
||||
offset = 0
|
||||
for key in keys:
|
||||
assert data[key].dim(
|
||||
) < max_dim, 'you should increase MAX_DATA_DIM'
|
||||
size = data[key].size()
|
||||
for i, s in enumerate(size):
|
||||
sizes[i + offset] = s
|
||||
offset += max_dim
|
||||
|
||||
# Move to GPU and broadcast.
|
||||
sizes_cuda = torch.cuda.LongTensor(sizes)
|
||||
torch.distributed.broadcast(
|
||||
sizes_cuda,
|
||||
get_model_parallel_src_rank(),
|
||||
group=get_model_parallel_group())
|
||||
|
||||
# Move back to cpu and unpack.
|
||||
sizes_cpu = sizes_cuda.cpu()
|
||||
key_size = {}
|
||||
key_numel = {}
|
||||
total_numel = 0
|
||||
offset = 0
|
||||
for key in keys:
|
||||
i = 0
|
||||
size = []
|
||||
numel = 1
|
||||
while sizes_cpu[offset + i] > 0:
|
||||
this_size = sizes_cpu[offset + i]
|
||||
size.append(this_size)
|
||||
numel *= this_size
|
||||
i += 1
|
||||
key_size[key] = size
|
||||
key_numel[key] = numel
|
||||
total_numel += numel
|
||||
offset += max_dim
|
||||
|
||||
return key_size, key_numel, total_numel
|
||||
|
||||
|
||||
def broadcast_data(keys, data, datatype):
|
||||
"""Broadcast data from rank zero of each model parallel group to the
|
||||
members of the same model parallel group.
|
||||
|
||||
Arguments:
|
||||
        keys: list of keys in the data dictionary to be broadcast
|
||||
data: data dictionary of string keys and cpu tensor values.
|
||||
datatype: torch data type of all tensors in data associated
|
||||
with keys.
|
||||
"""
|
||||
# Build (key, size) and (key, number of elements) dictionaries along
|
||||
# with the total number of elements on all ranks.
|
||||
key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(
|
||||
keys, data)
|
||||
|
||||
# Pack on rank zero.
|
||||
if get_model_parallel_rank() == 0:
|
||||
# Check that all keys have the same data type.
|
||||
_check_data_types(keys, data, datatype)
|
||||
# Flatten the data associated with the keys
|
||||
flatten_data = torch.cat(
|
||||
[data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
|
||||
else:
|
||||
flatten_data = torch.empty(
|
||||
total_numel, device=torch.cuda.current_device(), dtype=datatype)
|
||||
|
||||
    # Broadcast
|
||||
torch.distributed.broadcast(
|
||||
flatten_data,
|
||||
get_model_parallel_src_rank(),
|
||||
group=get_model_parallel_group())
|
||||
|
||||
# Unpack
|
||||
output = {}
|
||||
offset = 0
|
||||
for key in keys:
|
||||
size = key_size[key]
|
||||
numel = key_numel[key]
|
||||
output[key] = flatten_data.narrow(0, offset, numel).view(size)
|
||||
offset += numel
|
||||
|
||||
return output
|
||||
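broadcast_data lets rank 0 of each model-parallel group read a batch once and fan it out to its peers: the tensor sizes are broadcast first so non-source ranks can pre-allocate one flat buffer, then the concatenated payload follows in a single broadcast. A hedged usage sketch (assumes torch.distributed and the mpu groups are already initialized; as in mpu/tests/commons.py, `import mpu` is assumed to resolve to this package):

import torch

import mpu  # assumed importable, as in mpu/tests/commons.py

keys = ['text', 'loss_mask']
if mpu.get_model_parallel_rank() == 0:
    # Only the source rank materializes the batch; all keys share one dtype.
    data = {
        'text': torch.randint(0, 30000, (4, 512)),
        'loss_mask': torch.randint(0, 2, (4, 512)),
    }
else:
    data = None  # non-source ranks receive the broadcast result on GPU

batch = mpu.broadcast_data(keys, data, torch.int64)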
72
modelscope/models/nlp/mglm/mpu/grads.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Parts of the code here are adapted from PyTorch
|
||||
# repo: https://github.com/pytorch/pytorch
|
||||
|
||||
import torch
|
||||
from torch._six import inf
|
||||
|
||||
from .initialize import get_model_parallel_group, get_model_parallel_rank
|
||||
|
||||
|
||||
def clip_grad_norm(parameters, max_norm, norm_type=2):
|
||||
"""Clips gradient norm of an iterable of parameters.
|
||||
|
||||
This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
|
||||
added functionality to handle model parallel parameters. Note that
|
||||
the gradients are modified in place.
|
||||
|
||||
Arguments:
|
||||
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
|
||||
single Tensor that will have gradients normalized
|
||||
max_norm (float or int): max norm of the gradients
|
||||
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
|
||||
infinity norm.
|
||||
|
||||
Returns:
|
||||
Total norm of the parameters (viewed as a single vector).
|
||||
"""
|
||||
if isinstance(parameters, torch.Tensor):
|
||||
parameters = [parameters]
|
||||
parameters = list(filter(lambda p: p.grad is not None, parameters))
|
||||
max_norm = float(max_norm)
|
||||
norm_type = float(norm_type)
|
||||
if norm_type == inf:
|
||||
total_norm = max(p.grad.data.abs().max() for p in parameters)
|
||||
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
|
||||
# Take max across all GPUs.
|
||||
torch.distributed.all_reduce(
|
||||
total_norm_cuda,
|
||||
op=torch.distributed.ReduceOp.MAX,
|
||||
group=get_model_parallel_group())
|
||||
total_norm = total_norm_cuda[0].item()
|
||||
else:
|
||||
total_norm = 0
|
||||
for p in parameters:
|
||||
if p.model_parallel or (get_model_parallel_rank() == 0):
|
||||
param_norm = p.grad.data.norm(norm_type)
|
||||
total_norm += param_norm.item()**norm_type
|
||||
# Sum across all model parallel GPUs.
|
||||
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
|
||||
torch.distributed.all_reduce(
|
||||
total_norm_cuda,
|
||||
op=torch.distributed.ReduceOp.SUM,
|
||||
group=get_model_parallel_group())
|
||||
total_norm = total_norm_cuda[0].item()**(1. / norm_type)
|
||||
clip_coef = max_norm / (total_norm + 1e-6)
|
||||
if clip_coef < 1:
|
||||
for p in parameters:
|
||||
p.grad.data.mul_(clip_coef)
|
||||
return total_norm
|
||||
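clip_grad_norm extends torch.nn.utils.clip_grad_norm_ so that, for the 2-norm case, model-parallel parameters contribute on every rank while duplicated parameters are counted only on rank 0, and the partial sums are all-reduced before taking the root. Stripped of that distributed bookkeeping, the underlying operation is plain global-norm scaling; a minimal single-process sketch:

import torch

# Single-process sketch of the global-norm clipping performed above: compute the
# combined 2-norm of all gradients, then rescale them so it does not exceed max_norm.
def clip_by_global_norm(grads, max_norm):
    total_norm = sum(g.norm(2).item() ** 2 for g in grads) ** 0.5
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for g in grads:
            g.mul_(clip_coef)
    return total_norm  # pre-clipping norm, as clip_grad_norm returns above

grads = [torch.randn(10), torch.randn(5, 5)]
norm_before = clip_by_global_norm(grads, max_norm=1.0)
assert sum(g.norm(2).item() ** 2 for g in grads) ** 0.5 <= 1.0 + 1e-4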
130
modelscope/models/nlp/mglm/mpu/initialize.py
Normal file
@@ -0,0 +1,130 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Model and data parallel groups."""
|
||||
|
||||
import torch
|
||||
|
||||
from .utils import ensure_divisibility
|
||||
|
||||
# Model parallel group that the current rank belongs to.
|
||||
_MODEL_PARALLEL_GROUP = None
|
||||
# Data parallel group that the current rank belongs to.
|
||||
_DATA_PARALLEL_GROUP = None
|
||||
|
||||
|
||||
def initialize_model_parallel(model_parallel_size_):
|
||||
"""
|
||||
Initialize model data parallel groups.
|
||||
|
||||
Arguments:
|
||||
model_parallel_size: number of GPUs used to parallelize model.
|
||||
|
||||
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
|
||||
use 2 GPUs to parallelize the model. The present function will
|
||||
    create 4 model parallel groups and 2 data parallel groups as:
|
||||
4 model parallel groups:
|
||||
[g0, g1], [g2, g3], [g4, g5], [g6, g7]
|
||||
2 data parallel groups:
|
||||
[g0, g2, g4, g6], [g1, g3, g5, g7]
|
||||
Note that for efficiency, the caller should make sure adjacent ranks
|
||||
are on the same DGX box. For example if we are using 2 DGX-1 boxes
|
||||
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
|
||||
ranks 8 to 15 belong to the second box.
|
||||
"""
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> initializing model parallel with size {}'.format(
|
||||
model_parallel_size_))
|
||||
# Get world size and rank. Ensure some consistencies.
|
||||
assert torch.distributed.is_initialized()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
model_parallel_size = min(model_parallel_size_, world_size)
|
||||
ensure_divisibility(world_size, model_parallel_size)
|
||||
rank = torch.distributed.get_rank()
|
||||
|
||||
# Build the data parallel groups.
|
||||
global _DATA_PARALLEL_GROUP
|
||||
assert _DATA_PARALLEL_GROUP is None, \
|
||||
'data parallel group is already initialized'
|
||||
for i in range(model_parallel_size):
|
||||
ranks = range(i, world_size, model_parallel_size)
|
||||
group = torch.distributed.new_group(ranks)
|
||||
if i == (rank % model_parallel_size):
|
||||
_DATA_PARALLEL_GROUP = group
|
||||
|
||||
# Build the model parallel groups.
|
||||
global _MODEL_PARALLEL_GROUP
|
||||
assert _MODEL_PARALLEL_GROUP is None, \
|
||||
'model parallel group is already initialized'
|
||||
for i in range(world_size // model_parallel_size):
|
||||
ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
|
||||
group = torch.distributed.new_group(ranks)
|
||||
if i == (rank // model_parallel_size):
|
||||
_MODEL_PARALLEL_GROUP = group
|
||||
|
||||
|
||||
def model_parallel_is_initialized():
|
||||
"""Check if model and data parallel groups are initialized."""
|
||||
if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def get_model_parallel_group():
|
||||
"""Get the model parallel group the caller rank belongs to."""
|
||||
assert _MODEL_PARALLEL_GROUP is not None, \
|
||||
'model parallel group is not initialized'
|
||||
return _MODEL_PARALLEL_GROUP
|
||||
|
||||
|
||||
def get_data_parallel_group():
|
||||
"""Get the data parallel group the caller rank belongs to."""
|
||||
assert _DATA_PARALLEL_GROUP is not None, \
|
||||
'data parallel group is not initialized'
|
||||
return _DATA_PARALLEL_GROUP
|
||||
|
||||
|
||||
def get_model_parallel_world_size():
|
||||
"""Return world size for the model parallel group."""
|
||||
return torch.distributed.get_world_size(group=get_model_parallel_group())
|
||||
|
||||
|
||||
def get_model_parallel_rank():
|
||||
"""Return my rank for the model parallel group."""
|
||||
return torch.distributed.get_rank(group=get_model_parallel_group())
|
||||
|
||||
|
||||
def get_model_parallel_src_rank():
|
||||
"""Calculate the global rank corresponding to a local rank zeor
|
||||
in the model parallel group."""
|
||||
global_rank = torch.distributed.get_rank()
|
||||
local_world_size = get_model_parallel_world_size()
|
||||
return (global_rank // local_world_size) * local_world_size
|
||||
|
||||
|
||||
def get_data_parallel_world_size():
|
||||
"""Return world size for the data parallel group."""
|
||||
return torch.distributed.get_world_size(group=get_data_parallel_group())
|
||||
|
||||
|
||||
def get_data_parallel_rank():
|
||||
"""Return my rank for the data parallel group."""
|
||||
return torch.distributed.get_rank(group=get_data_parallel_group())
|
||||
|
||||
|
||||
def destroy_model_parallel():
|
||||
"""Set the groups to none."""
|
||||
global _MODEL_PARALLEL_GROUP
|
||||
_MODEL_PARALLEL_GROUP = None
|
||||
global _DATA_PARALLEL_GROUP
|
||||
_DATA_PARALLEL_GROUP = None
|
||||
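The docstring example in initialize_model_parallel can be reproduced with two lines of plain Python, which makes the group layout explicit: consecutive ranks share a model-parallel group, and ranks occupying the same position inside their model-parallel group form a data-parallel group.

# Pure-Python sketch of the grouping logic in initialize_model_parallel,
# reproducing the docstring example (world_size=8, model_parallel_size=2).
world_size, model_parallel_size = 8, 2

model_parallel_groups = [
    list(range(i * model_parallel_size, (i + 1) * model_parallel_size))
    for i in range(world_size // model_parallel_size)]
data_parallel_groups = [
    list(range(i, world_size, model_parallel_size))
    for i in range(model_parallel_size)]

print(model_parallel_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]
print(data_parallel_groups)   # [[0, 2, 4, 6], [1, 3, 5, 7]]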
357
modelscope/models/nlp/mglm/mpu/layers.py
Normal file
@@ -0,0 +1,357 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Parts of the code here are adapted from PyTorch
|
||||
# repo: https://github.com/pytorch/pytorch
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn.init as init
|
||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from .initialize import get_model_parallel_rank, get_model_parallel_world_size
|
||||
from .mappings import (copy_to_model_parallel_region,
|
||||
gather_from_model_parallel_region,
|
||||
reduce_from_model_parallel_region,
|
||||
scatter_to_model_parallel_region)
|
||||
from .random import get_cuda_rng_tracker
|
||||
from .utils import VocabUtility, divide, split_tensor_along_last_dim
|
||||
|
||||
|
||||
def _initialize_affine_weight(weight,
|
||||
output_size,
|
||||
input_size,
|
||||
per_partition_size,
|
||||
partition_dim,
|
||||
init_method,
|
||||
stride=1,
|
||||
return_master_weight=False):
|
||||
"""Initialize affine weight for model parallel.
|
||||
|
||||
Build the master weight on all processes and scatter
|
||||
the relevant chunk."""
|
||||
# If we only use 1 process for model parallelism, bypass scatter.
|
||||
world_size = get_model_parallel_world_size()
|
||||
if world_size == 1:
|
||||
init_method(weight)
|
||||
if return_master_weight:
|
||||
return weight
|
||||
return None
|
||||
|
||||
# Initialize master weight
|
||||
master_weight = torch.empty(
|
||||
output_size, input_size, dtype=weight.dtype, requires_grad=False)
|
||||
init_method(master_weight)
|
||||
|
||||
# Split and copy
|
||||
per_partition_per_stride_size = divide(per_partition_size, stride)
|
||||
weight_list = torch.split(
|
||||
master_weight, per_partition_per_stride_size, dim=partition_dim)
|
||||
rank = get_model_parallel_rank()
|
||||
my_weight_list = weight_list[rank::world_size]
|
||||
|
||||
with torch.no_grad():
|
||||
torch.cat(my_weight_list, dim=partition_dim, out=weight)
|
||||
if return_master_weight:
|
||||
return master_weight
|
||||
return None
|
||||
|
||||
|
||||
class VocabParallelEmbedding(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
This is mainly adapted from torch.nn.Embedding and all the default
|
||||
values are kept.
|
||||
Arguments:
|
||||
num_embeddings: vocabulary size.
|
||||
embedding_dim: size of hidden state.
|
||||
init_method: method to initialize weights.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
init_method=init.xavier_normal_):
|
||||
super(VocabParallelEmbedding, self).__init__()
|
||||
# Keep the input dimensions.
|
||||
self.num_embeddings = num_embeddings
|
||||
self.embedding_dim = embedding_dim
|
||||
        # Set the defaults for compatibility.
|
||||
self.padding_idx = None
|
||||
self.max_norm = None
|
||||
self.norm_type = 2.
|
||||
self.scale_grad_by_freq = False
|
||||
self.sparse = False
|
||||
self._weight = None
|
||||
        # Divide the weight matrix along the vocabulary dimension.
|
||||
self.vocab_start_index, self.vocab_end_index = \
|
||||
VocabUtility.vocab_range_from_global_vocab_size(
|
||||
self.num_embeddings, get_model_parallel_rank(),
|
||||
get_model_parallel_world_size())
|
||||
self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index # noqa
|
||||
|
||||
# Allocate weights.
|
||||
self.weight = Parameter(
|
||||
torch.Tensor(self.num_embeddings_per_partition,
|
||||
self.embedding_dim))
|
||||
self.weight.model_parallel = True
|
||||
# And initialize.
|
||||
_initialize_affine_weight(self.weight, self.num_embeddings,
|
||||
self.embedding_dim,
|
||||
self.num_embeddings_per_partition, 0,
|
||||
init_method)
|
||||
|
||||
def forward(self, input_):
|
||||
# Build the mask.
|
||||
input_mask = (input_ < self.vocab_start_index) | \
|
||||
(input_ >= self.vocab_end_index)
|
||||
# Mask the input.
|
||||
masked_input = input_.clone() - self.vocab_start_index
|
||||
masked_input[input_mask] = 0
|
||||
# Get the embeddings.
|
||||
output_parallel = F.embedding(masked_input, self.weight,
|
||||
self.padding_idx, self.max_norm,
|
||||
self.norm_type, self.scale_grad_by_freq,
|
||||
self.sparse)
|
||||
# Mask the output embedding.
|
||||
output_parallel[input_mask, :] = 0.0
|
||||
# Reduce across all the model parallel GPUs.
|
||||
output = reduce_from_model_parallel_region(output_parallel)
|
||||
return output
|
||||
|
||||
|
||||
class ParallelEmbedding(torch.nn.Module):
|
||||
"""Embedding parallelized in the embedding dimension.
|
||||
|
||||
This is mainly adapted from torch.nn.Embedding and all the default
|
||||
values are kept.
|
||||
Arguments:
|
||||
num_embeddings: vocabulary size.
|
||||
embedding_dim: size of hidden state.
|
||||
init_method: method to initialize weights.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
init_method=init.xavier_normal_,
|
||||
keep_master_weight_for_test=False):
|
||||
super(ParallelEmbedding, self).__init__()
|
||||
# Keep the input dimensions.
|
||||
self.num_embeddings = num_embeddings
|
||||
self.embedding_dim = embedding_dim
|
||||
        # Set some defaults for compatibility.
|
||||
self.padding_idx = None
|
||||
self.max_norm = None
|
||||
self.norm_type = 2.
|
||||
self.scale_grad_by_freq = False
|
||||
self.sparse = False
|
||||
self._weight = None
|
||||
# Divide the weight matrix along the embedding dimension.
|
||||
world_size = get_model_parallel_world_size()
|
||||
self.embedding_dim_per_partition = divide(self.embedding_dim,
|
||||
world_size)
|
||||
|
||||
# Allocate weights.
|
||||
self.weight = Parameter(
|
||||
torch.Tensor(self.num_embeddings,
|
||||
self.embedding_dim_per_partition))
|
||||
self.weight.model_parallel = True
|
||||
# And initialize.
|
||||
_initialize_affine_weight(
|
||||
self.weight,
|
||||
self.num_embeddings,
|
||||
self.embedding_dim,
|
||||
self.embedding_dim_per_partition,
|
||||
1,
|
||||
init_method,
|
||||
stride=1,
|
||||
return_master_weight=False)
|
||||
|
||||
def forward(self, input_):
|
||||
input_parallel = copy_to_model_parallel_region(input_)
|
||||
output_parallel = F.embedding(input_parallel, self.weight,
|
||||
self.padding_idx, self.max_norm,
|
||||
self.norm_type, self.scale_grad_by_freq,
|
||||
self.sparse)
|
||||
output = gather_from_model_parallel_region(output_parallel)
|
||||
return output
|
||||
|
||||
|
||||
class ColumnParallelLinear(torch.nn.Module):
|
||||
"""Linear layer with column parallelism.
|
||||
|
||||
The linear layer is defined as Y = XA + b. A is parallelized along
|
||||
its second dimension as A = [A_1, ..., A_p].
|
||||
|
||||
Arguments:
|
||||
input_size: first dimension of matrix A.
|
||||
output_size: second dimension of matrix A.
|
||||
bias: If true, add bias
|
||||
        gather_output: If true, call all-gather on output and make Y available
|
||||
to all GPUs, otherwise, every GPU will have its output
|
||||
which is Y_i = XA_i
|
||||
init_method: method to initialize weights. Note that bias is always set
|
||||
to zero.
|
||||
stride: For the strided linear layers.
|
||||
keep_master_weight_for_test: This was added for testing and should be
|
||||
set to False. It returns the master weights
|
||||
used for initialization.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
input_size,
|
||||
output_size,
|
||||
bias=True,
|
||||
gather_output=True,
|
||||
init_method=init.xavier_normal_,
|
||||
stride=1,
|
||||
keep_master_weight_for_test=False):
|
||||
super(ColumnParallelLinear, self).__init__()
|
||||
|
||||
# Keep input parameters
|
||||
self.input_size = input_size
|
||||
self.output_size = output_size
|
||||
self.gather_output = gather_output
|
||||
# Divide the weight matrix along the last dimension.
|
||||
world_size = get_model_parallel_world_size()
|
||||
self.output_size_per_partition = divide(output_size, world_size)
|
||||
|
||||
# Parameters.
|
||||
# Note: torch.nn.functional.linear performs XA^T + b and as a result
|
||||
# we allocate the transpose.
|
||||
self.weight = Parameter(
|
||||
torch.Tensor(self.output_size_per_partition, self.input_size))
|
||||
self.weight.model_parallel = True
|
||||
if bias:
|
||||
self.bias = Parameter(torch.Tensor(self.output_size_per_partition))
|
||||
self.bias.model_parallel = True
|
||||
# Always initialize bias to zero.
|
||||
with torch.no_grad():
|
||||
self.bias.zero_()
|
||||
else:
|
||||
self.register_parameter('bias', None)
|
||||
|
||||
# Initialize weight.
|
||||
self.master_weight = _initialize_affine_weight(
|
||||
self.weight,
|
||||
self.output_size,
|
||||
self.input_size,
|
||||
self.output_size_per_partition,
|
||||
0,
|
||||
init_method,
|
||||
stride=stride,
|
||||
return_master_weight=keep_master_weight_for_test)
|
||||
|
||||
def forward(self, input_):
|
||||
# Set up backprop all-reduce.
|
||||
input_parallel = copy_to_model_parallel_region(input_)
|
||||
# Matrix multiply.
|
||||
output_parallel = F.linear(input_parallel, self.weight, self.bias)
|
||||
if self.gather_output:
|
||||
# All-gather across the partitions.
|
||||
output = gather_from_model_parallel_region(output_parallel)
|
||||
else:
|
||||
output = output_parallel
|
||||
return output
|
||||
|
||||
|
||||
class RowParallelLinear(torch.nn.Module):
|
||||
"""Linear layer with row parallelism.
|
||||
|
||||
The linear layer is defined as Y = XA + b. A is parallelized along
|
||||
its first dimension and X along its second dimension as:
|
||||
- -
|
||||
| A_1 |
|
||||
| . |
|
||||
A = | . | X = [X_1, ..., X_p]
|
||||
| . |
|
||||
| A_p |
|
||||
- -
|
||||
Arguments:
|
||||
input_size: first dimension of matrix A.
|
||||
output_size: second dimension of matrix A.
|
||||
bias: If true, add bias. Note that bias is not parallelized.
|
||||
input_is_parallel: If true, we assume that the input is already
|
||||
split across the GPUs and we do not split
|
||||
again.
|
||||
init_method: method to initialize weights. Note that bias is always set
|
||||
to zero.
|
||||
stride: For the strided linear layers.
|
||||
keep_master_weight_for_test: This was added for testing and should be
|
||||
set to False. It returns the master weights
|
||||
used for initialization.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
input_size,
|
||||
output_size,
|
||||
bias=True,
|
||||
input_is_parallel=False,
|
||||
init_method=init.xavier_normal_,
|
||||
stride=1,
|
||||
keep_master_weight_for_test=False):
|
||||
super(RowParallelLinear, self).__init__()
|
||||
|
||||
# Keep input parameters
|
||||
self.input_size = input_size
|
||||
self.output_size = output_size
|
||||
self.input_is_parallel = input_is_parallel
|
||||
# Divide the weight matrix along the last dimension.
|
||||
world_size = get_model_parallel_world_size()
|
||||
self.input_size_per_partition = divide(input_size, world_size)
|
||||
|
||||
# Parameters.
|
||||
# Note: torch.nn.functional.linear performs XA^T + b and as a result
|
||||
# we allocate the transpose.
|
||||
self.weight = Parameter(
|
||||
torch.Tensor(self.output_size, self.input_size_per_partition))
|
||||
self.weight.model_parallel = True
|
||||
if bias:
|
||||
self.bias = Parameter(torch.Tensor(self.output_size))
|
||||
# Always initialize bias to zero.
|
||||
with torch.no_grad():
|
||||
self.bias.zero_()
|
||||
else:
|
||||
self.register_parameter('bias', None)
|
||||
|
||||
# Initialize weight.
|
||||
self.master_weight = _initialize_affine_weight(
|
||||
self.weight,
|
||||
self.output_size,
|
||||
self.input_size,
|
||||
self.input_size_per_partition,
|
||||
1,
|
||||
init_method,
|
||||
stride=stride,
|
||||
return_master_weight=keep_master_weight_for_test)
|
||||
|
||||
def forward(self, input_):
|
||||
# Set up backprop all-reduce.
|
||||
if self.input_is_parallel:
|
||||
input_parallel = input_
|
||||
else:
|
||||
input_parallel = scatter_to_model_parallel_region(input_)
|
||||
# Matrix multiply.
|
||||
output_parallel = F.linear(input_parallel, self.weight)
|
||||
# All-reduce across all the partitions.
|
||||
output_ = reduce_from_model_parallel_region(output_parallel)
|
||||
if self.bias is not None:
|
||||
output = output_ + self.bias
|
||||
else:
|
||||
output = output_
|
||||
return output
|
||||
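A common way to use these layers is to pair a ColumnParallelLinear with gather_output=False and a RowParallelLinear with input_is_parallel=True, so an MLP block needs only one all-reduce at the end instead of an all-gather in the middle. The algebra behind that pairing, sketched on a single process with plain matrix multiplies (the real layers use F.linear with transposed weights):

import torch

# Y = X @ A @ B with A split by columns and B split by rows across two "ranks":
# each rank computes (X @ A_i) @ B_i locally, and summing the partial results
# (the single all-reduce) recovers the full product.
torch.manual_seed(0)
X = torch.randn(4, 8)
A = torch.randn(8, 6)
B = torch.randn(6, 8)

A1, A2 = A.chunk(2, dim=1)   # column-parallel shards (ColumnParallelLinear)
B1, B2 = B.chunk(2, dim=0)   # row-parallel shards (RowParallelLinear)

Y_parallel = (X @ A1) @ B1 + (X @ A2) @ B2
assert torch.allclose(Y_parallel, X @ A @ B, atol=1e-4)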
144
modelscope/models/nlp/mglm/mpu/mappings.py
Normal file
@@ -0,0 +1,144 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from .initialize import get_model_parallel_group
|
||||
from .utils import split_tensor_along_last_dim
|
||||
|
||||
|
||||
def _reduce(input_):
|
||||
"""All-reduce the the input tensor across model parallel group."""
|
||||
group = get_model_parallel_group()
|
||||
|
||||
# Bypass the function if we are using only 1 GPU.
|
||||
if torch.distributed.get_world_size(group=group) == 1:
|
||||
return input_
|
||||
|
||||
# All-reduce.
|
||||
torch.distributed.all_reduce(input_, group=group)
|
||||
|
||||
return input_
|
||||
|
||||
|
||||
def _split(input_):
|
||||
"""Split the tensor along its last dimension and keep the
|
||||
corresponding slice."""
|
||||
group = get_model_parallel_group()
|
||||
|
||||
# Bypass the function if we are using only 1 GPU.
|
||||
if torch.distributed.get_world_size(group=group) == 1:
|
||||
return input_
|
||||
|
||||
# Split along last dimension.
|
||||
world_size = torch.distributed.get_world_size(group=group)
|
||||
input_list = split_tensor_along_last_dim(input_, world_size)
|
||||
|
||||
# Note: torch.split does not create contiguous tensors by default.
|
||||
rank = torch.distributed.get_rank(group=group)
|
||||
output = input_list[rank].contiguous()
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def _gather(input_):
|
||||
"""Gather tensors and concatinate along the last dimension."""
|
||||
group = get_model_parallel_group()
|
||||
|
||||
# Bypass the function if we are using only 1 GPU.
|
||||
if torch.distributed.get_world_size(group=group) == 1:
|
||||
return input_
|
||||
|
||||
# Size and dimension.
|
||||
last_dim = input_.dim() - 1
|
||||
rank = torch.distributed.get_rank(group=group)
|
||||
world_size = torch.distributed.get_world_size(group=group)
|
||||
|
||||
tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
|
||||
tensor_list[rank] = input_
|
||||
torch.distributed.all_gather(tensor_list, input_, group=group)
|
||||
|
||||
# Note: torch.cat already creates a contiguous tensor.
|
||||
output = torch.cat(tensor_list, dim=last_dim).contiguous()
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class _CopyToModelParallelRegion(torch.autograd.Function):
|
||||
"""Pass the input to the model parallel region."""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, input_):
|
||||
return input_
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return _reduce(grad_output)
|
||||
|
||||
|
||||
class _ReduceFromModelParallelRegion(torch.autograd.Function):
|
||||
"""All-redcue the input from the model parallel region."""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, input_):
|
||||
return _reduce(input_)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return grad_output
|
||||
|
||||
|
||||
class _ScatterToModelParallelRegion(torch.autograd.Function):
|
||||
"""Split the input and keep only the corresponding chuck to the rank."""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, input_):
|
||||
return _split(input_)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return _gather(grad_output)
|
||||
|
||||
|
||||
class _GatherFromModelParallelRegion(torch.autograd.Function):
|
||||
"""Gather the input from model parallel region and concatinate."""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, input_):
|
||||
return _gather(input_)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return _split(grad_output)
|
||||
|
||||
|
||||
# -----------------
|
||||
# Helper functions.
|
||||
# -----------------
|
||||
|
||||
|
||||
def copy_to_model_parallel_region(input_):
|
||||
return _CopyToModelParallelRegion.apply(input_)
|
||||
|
||||
|
||||
def reduce_from_model_parallel_region(input_):
|
||||
return _ReduceFromModelParallelRegion.apply(input_)
|
||||
|
||||
|
||||
def scatter_to_model_parallel_region(input_):
|
||||
return _ScatterToModelParallelRegion.apply(input_)
|
||||
|
||||
|
||||
def gather_from_model_parallel_region(input_):
|
||||
return _GatherFromModelParallelRegion.apply(input_)
|
||||
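The four regions above come in adjoint pairs: copy is the identity forward and an all-reduce backward, reduce is the reverse; scatter splits forward and gathers backward, gather is the reverse. The invariant worth spelling out is that _split followed by _gather along the last dimension is the identity, which a single-process check makes concrete:

import torch

# _split keeps one contiguous chunk of the last dimension per rank;
# _gather concatenates those chunks back, so the round trip is the identity.
world_size = 2
x = torch.arange(12.).view(3, 4)
chunks = torch.split(x, x.size(-1) // world_size, dim=-1)
assert torch.equal(torch.cat(chunks, dim=-1), x)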
408
modelscope/models/nlp/mglm/mpu/random.py
Executable file
@@ -0,0 +1,408 @@
|
||||
# Modified by Samyam Rajbhandari
# Used to partition the activations stored for backward propagation,
# thereby reducing the memory consumption.
|
||||
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Parts of the code here are adapted from PyTorch
|
||||
# repo: https://github.com/pytorch/pytorch
|
||||
import contextlib
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from torch import _C
|
||||
from torch.cuda import _lazy_call
|
||||
from torch.cuda import device as device_ctx_manager
|
||||
|
||||
from .initialize import (get_data_parallel_rank, get_model_parallel_group,
|
||||
get_model_parallel_rank,
|
||||
get_model_parallel_world_size)
|
||||
|
||||
# from torch.utils.checkpoint import detach_variable
|
||||
|
||||
PARTITION_ACTIVATIONS = False
|
||||
PA_CORRECTNESS_TEST = False
|
||||
|
||||
|
||||
def see_memory_usage(message, force=False):
|
||||
if not force:
|
||||
return
|
||||
dist.barrier()
|
||||
if dist.get_rank() == 0:
|
||||
print(message)
|
||||
print('Memory Allocated ',
|
||||
torch.cuda.memory_allocated() / (1024 * 1024 * 1024),
|
||||
'GigaBytes')
|
||||
print('Max Memory Allocated ',
|
||||
torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),
|
||||
'GigaBytes')
|
||||
print('Cache Allocated ',
|
||||
torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes')
|
||||
print('Max cache Allocated ',
|
||||
torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
|
||||
'GigaBytes')
|
||||
print(' ')
|
||||
# input("Press Any Key To Continue ..")
|
||||
|
||||
|
||||
mp_rank = None # get_model_parallel_rank()
|
||||
mp_size = None # get_model_parallel_world_size()
|
||||
mp_group = None # get_model_parallel_group()
|
||||
|
||||
# Default name for the model parallel rng tracker.
|
||||
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
|
||||
transport_stream = None
|
||||
cuda_device = None
|
||||
|
||||
|
||||
def detach_variable(inputs, device=None):
|
||||
if isinstance(inputs, tuple):
|
||||
out = []
|
||||
for inp in inputs:
|
||||
if not isinstance(inp, torch.Tensor):
|
||||
out.append(inp)
|
||||
continue
|
||||
|
||||
requires_grad = inp.requires_grad
|
||||
|
||||
if device is not None:
|
||||
x = inp.to(device=device)
|
||||
else:
|
||||
x = inp
|
||||
|
||||
x = x.detach()
|
||||
x.requires_grad = requires_grad
|
||||
out.append(x)
|
||||
return tuple(out)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
'Only tuple of tensors is supported. Got Unsupported input type: ',
|
||||
type(inputs).__name__)
|
||||
|
||||
|
||||
def _set_cuda_rng_state(new_state, device=-1):
|
||||
"""Sets the random number generator state of the current GPU.
|
||||
|
||||
    Arguments:
|
||||
new_state (torch.ByteTensor): The desired state
|
||||
This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
|
||||
with a single change: the input state is not cloned. Cloning caused
|
||||
    major performance issues for 4+ GPU cases.
|
||||
"""
|
||||
if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
|
||||
# older PyTorch
|
||||
def cb():
|
||||
with device_ctx_manager(device):
|
||||
_C._cuda_setRNGState(new_state)
|
||||
else:
|
||||
# newer PyTorch
|
||||
if device == -1:
|
||||
device = torch.device('cuda')
|
||||
elif isinstance(device, str):
|
||||
device = torch.device(device)
|
||||
elif isinstance(device, int):
|
||||
device = torch.device('cuda', device)
|
||||
|
||||
def cb():
|
||||
idx = device.index
|
||||
if idx is None:
|
||||
idx = torch.cuda.current_device()
|
||||
default_generator = torch.cuda.default_generators[idx]
|
||||
default_generator.set_state(new_state)
|
||||
|
||||
_lazy_call(cb)
|
||||
|
||||
|
||||
class CudaRNGStatesTracker:
|
||||
"""Tracker for the cuda RNG states.
|
||||
|
||||
Using the `add` method, a cuda rng state is initialized based on
|
||||
the input `seed` and is assigned to `name`. Later, by forking the
|
||||
rng state, we can perform operations and return to our starting
|
||||
cuda state.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
# Map from a string name to the cuda rng state.
|
||||
self.states_ = {}
|
||||
        # Seeds are just for bookkeeping and to ensure no seed is set twice.
|
||||
self.seeds_ = set()
|
||||
|
||||
def reset(self):
|
||||
"""Set to the initial state (no tracker)."""
|
||||
self.states_ = {}
|
||||
self.seeds_ = set()
|
||||
|
||||
def get_states(self):
|
||||
"""Get rng states. Copy the dictionary so we have direct
|
||||
pointers to the states, not just a pointer to the dictionary."""
|
||||
states = {}
|
||||
for name in self.states_:
|
||||
states[name] = self.states_[name]
|
||||
return states
|
||||
|
||||
def set_states(self, states):
|
||||
"""Set the rng states. For efficiency purposes, we do not check
|
||||
the size of seed for compatibility."""
|
||||
self.states_ = states
|
||||
|
||||
def add(self, name, seed):
|
||||
"""Track the rng state."""
|
||||
# Check seed is not already used.
|
||||
if seed in self.seeds_:
|
||||
raise Exception('seed {} already exists'.format(seed))
|
||||
self.seeds_.add(seed)
|
||||
# Check that state is not already defined.
|
||||
if name in self.states_:
|
||||
raise Exception('cuda rng state {} already exists'.format(name))
|
||||
# Get the current rng state.
|
||||
orig_rng_state = torch.cuda.get_rng_state()
|
||||
# Set the new state and store it.
|
||||
torch.cuda.manual_seed(seed)
|
||||
self.states_[name] = torch.cuda.get_rng_state()
|
||||
# Reset rng state to what it was.
|
||||
_set_cuda_rng_state(orig_rng_state)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
|
||||
"""Fork the cuda rng state, perform operations, and exit with
|
||||
the original state."""
|
||||
# Check if we have added the state
|
||||
if name not in self.states_:
|
||||
raise Exception('cuda rng state {} is not added'.format(name))
|
||||
# Store current rng state.
|
||||
orig_cuda_rng_state = torch.cuda.get_rng_state()
|
||||
# Set rng state to the desired one
|
||||
_set_cuda_rng_state(self.states_[name])
|
||||
# Do the stuff we wanted to do.
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
# Update the current rng state for later use.
|
||||
self.states_[name] = torch.cuda.get_rng_state()
|
||||
# And set the state to the original state we started with.
|
||||
_set_cuda_rng_state(orig_cuda_rng_state)
|
||||
|
||||
|
||||
# RNG tracker object.
|
||||
_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
|
||||
|
||||
|
||||
def get_cuda_rng_tracker():
|
||||
"""Get cuda rng tracker."""
|
||||
return _CUDA_RNG_STATE_TRACKER
|
||||
|
||||
|
||||
def model_parallel_cuda_manual_seed(seed):
|
||||
"""Initialize model parallel cuda seed.
|
||||
|
||||
This function should be called after the model parallel is
|
||||
initialized. Also, no torch.cuda.manual_seed should be called
|
||||
after this function. Basically, this is replacement for that
|
||||
function.
|
||||
    Two sets of RNG states are tracked:
|
||||
default state: This is for data parallelism and is the same among a
|
||||
set of model parallel GPUs but different across
|
||||
    different model parallel groups. This is used for
|
||||
example for dropout in the non-model-parallel regions.
|
||||
model-parallel state: This state is different among a set of model
|
||||
parallel GPUs, but the same across data parallel
|
||||
groups. This is used for example for dropout in
|
||||
model parallel regions.
|
||||
"""
|
||||
# 2718 is just for fun and any POSITIVE value will work.
|
||||
offset = seed + 2718
|
||||
model_parallel_seed = offset + get_model_parallel_rank()
|
||||
    # Data parallel gets the original seed.
|
||||
data_parallel_seed = seed
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(
|
||||
'> initializing model parallel cuda seeds on global rank {}, '
|
||||
'model parallel rank {}, and data parallel rank {} with '
|
||||
'model parallel seed: {} and data parallel seed: {}'.format(
|
||||
torch.distributed.get_rank(), get_model_parallel_rank(),
|
||||
get_data_parallel_rank(), model_parallel_seed,
|
||||
data_parallel_seed),
|
||||
flush=True)
|
||||
_CUDA_RNG_STATE_TRACKER.reset()
|
||||
# Set the default state.
|
||||
torch.cuda.manual_seed(data_parallel_seed)
|
||||
# and model parallel state.
|
||||
_CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
|
||||
model_parallel_seed)
|
||||
|
||||
|
||||
def get_partition_start(item):
|
||||
global mp_rank, mp_size, mp_group
|
||||
partition_size = get_partition_size(item)
|
||||
start = partition_size * mp_rank
|
||||
return int(start)
|
||||
|
||||
|
||||
def get_partition_size(item):
|
||||
global mp_rank, mp_size, mp_group
|
||||
size = item.numel()
|
||||
partition_size = size / mp_size
|
||||
return int(partition_size)
|
||||
|
||||
|
||||
def get_full_inputs(tensors):
|
||||
inputs = []
|
||||
for i in range(int(len(tensors) / 2) - 1):
|
||||
item = tensors[2 * i]
|
||||
size = tensors[2 * i + 1]
|
||||
partition_size = item.numel()
|
||||
tensor_size = partition_size * mp_size
|
||||
flat_tensor = torch.zeros([tensor_size],
|
||||
dtype=item.dtype,
|
||||
device=item.device)
|
||||
partitions = []
|
||||
for i in range(mp_size):
|
||||
part_i = flat_tensor.narrow(0, partition_size * i, partition_size)
|
||||
if i == mp_rank:
|
||||
part_i.copy_(item)
|
||||
partitions.append(part_i)
|
||||
dist.all_gather(partitions, partitions[mp_rank], group=mp_group)
|
||||
input_tensor = flat_tensor.view(list(size.numpy()))
|
||||
item.data = input_tensor.data
|
||||
|
||||
inputs.append(item)
|
||||
inputs.append(tensors[-2])
|
||||
|
||||
return tuple(inputs)
|
||||
|
||||
|
||||
class CheckpointFunction(torch.autograd.Function):
|
||||
"""This function is adapted from torch.utils.checkpoint with
|
||||
two main changes:
|
||||
1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
|
||||
2) the states in the model parallel tracker are also properly
|
||||
tracked/set/reset.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, run_function, *args):
|
||||
ctx.run_function = run_function
|
||||
global mp_rank, mp_size, mp_group
|
||||
if mp_rank is None:
|
||||
mp_rank = get_model_parallel_rank()
|
||||
mp_size = get_model_parallel_world_size()
|
||||
mp_group = get_model_parallel_group()
|
||||
|
||||
global cuda_device, transport_stream, PARTITION_ACTIVATIONS
|
||||
if cuda_device is None:
|
||||
if dist.get_rank() == 0:
|
||||
print(
|
||||
f'Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}'
|
||||
)
|
||||
|
||||
cuda_device = torch.cuda.current_device()
|
||||
# The transport stream is used to overlap the allgather communication for the activations
|
||||
# with the computation in the backward pass
|
||||
transport_stream = torch.cuda.Stream(device=cuda_device)
|
||||
|
||||
if PARTITION_ACTIVATIONS:
|
||||
inputs = [
|
||||
item.detach().contiguous().view(-1).narrow(
|
||||
0, get_partition_start(item),
|
||||
get_partition_size(item)).clone() for item in args[:-1]
|
||||
]
|
||||
inputs.append(args[-1])
|
||||
|
||||
# just in case something funky is happening such as reuse of inputs
|
||||
inputs_cuda = [item.to(cuda_device) for item in args]
|
||||
|
||||
# Copy the rng states.
|
||||
ctx.fwd_cpu_rng_state = torch.get_rng_state()
|
||||
ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
|
||||
ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
|
||||
|
||||
# ctx.save_for_backward(*args)
|
||||
with torch.no_grad():
|
||||
outputs = run_function(*inputs_cuda)
|
||||
|
||||
del inputs_cuda
|
||||
|
||||
if PARTITION_ACTIVATIONS:
|
||||
new_args = []
|
||||
for arg, inp in zip(args, inputs):
|
||||
size = torch.tensor(arg.size())
|
||||
arg.data = inp.data
|
||||
new_args.append(arg)
|
||||
new_args.append(size)
|
||||
ctx.save_for_backward(*new_args)
|
||||
else:
|
||||
ctx.save_for_backward(*args)
|
||||
|
||||
return outputs
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, *args):
|
||||
if not torch.autograd._is_checkpoint_valid():
|
||||
raise RuntimeError('Checkpointing is not compatible with .grad(), '
|
||||
'please use .backward() if possible')
|
||||
|
||||
global cuda_device, transport_stream, PARTITION_ACTIVATIONS
|
||||
|
||||
if PARTITION_ACTIVATIONS:
|
||||
with torch.cuda.stream(transport_stream):
|
||||
inputs = get_full_inputs(ctx.saved_tensors)
|
||||
detached_inputs = detach_variable(inputs)
|
||||
else:
|
||||
inputs = ctx.saved_tensors
|
||||
detached_inputs = detach_variable(inputs)
|
||||
|
||||
# Store the current states.
|
||||
bwd_cpu_rng_state = torch.get_rng_state()
|
||||
bwd_cuda_rng_state = torch.cuda.get_rng_state()
|
||||
bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
|
||||
|
||||
# Set the states to what it used to be before the forward pass.
|
||||
torch.set_rng_state(ctx.fwd_cpu_rng_state)
|
||||
_set_cuda_rng_state(ctx.fwd_cuda_rng_state)
|
||||
get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
|
||||
|
||||
if PARTITION_ACTIVATIONS:
|
||||
current_stream = torch.cuda.current_stream()
|
||||
current_stream.wait_stream(transport_stream)
|
||||
|
||||
with torch.enable_grad():
|
||||
outputs = ctx.run_function(*detached_inputs)
|
||||
|
||||
# Set the states back to what it was at the start of this function.
|
||||
torch.set_rng_state(bwd_cpu_rng_state)
|
||||
_set_cuda_rng_state(bwd_cuda_rng_state)
|
||||
get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
|
||||
|
||||
if isinstance(outputs, torch.Tensor):
|
||||
outputs = (outputs, )
|
||||
torch.autograd.backward(outputs, args)
|
||||
return (None, ) + tuple(inp.grad for inp in detached_inputs)
|
||||
|
||||
|
||||
def checkpoint(function, *args):
|
||||
"""Checkpoint a model or part of the model.
|
||||
This has been directly copied from torch.utils.checkpoint."""
|
||||
return CheckpointFunction.apply(function, *args)
|
||||
|
||||
|
||||
def partition_activations_in_checkpoint(partition_activation):
|
||||
global PARTITION_ACTIVATIONS
|
||||
PARTITION_ACTIVATIONS = partition_activation
|
||||
if dist.get_rank() == 0:
|
||||
print(
|
||||
f'**************Partition Activations {PARTITION_ACTIVATIONS}************'
|
||||
)
|
||||
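The intended usage pattern for the tracker above is: call model_parallel_cuda_manual_seed once after the parallel groups exist, then wrap any dropout that lives inside a model-parallel region in get_cuda_rng_tracker().fork() so it draws from the model-parallel RNG stream rather than the shared default stream. A hedged sketch (assumes CUDA, torch.distributed and the mpu groups are already initialized; `import mpu` resolves as in mpu/tests/commons.py):

import torch

import mpu  # assumed importable, as in mpu/tests/commons.py

mpu.model_parallel_cuda_manual_seed(1234)  # once, after initialize_model_parallel

dropout = torch.nn.Dropout(0.1)
hidden = torch.randn(4, 16, device='cuda', requires_grad=True)

# Dropout inside a model-parallel region: fork to the tracked model-parallel
# RNG state so the dropout mask differs across model-parallel ranks.
with mpu.get_cuda_rng_tracker().fork():
    hidden = dropout(hidden)

# RNG-aware activation checkpointing (CheckpointFunction above) for a sub-module.
output = mpu.checkpoint(lambda x: torch.tanh(x) * 2, hidden)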
0
modelscope/models/nlp/mglm/mpu/tests/__init__.py
Normal file
86
modelscope/models/nlp/mglm/mpu/tests/commons.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
|
||||
import mpu
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
|
||||
class IdentityLayer(torch.nn.Module):
|
||||
|
||||
def __init__(self, size, scale=1.0):
|
||||
super(IdentityLayer, self).__init__()
|
||||
self.weight = torch.nn.Parameter(scale * torch.randn(size))
|
||||
|
||||
def forward(self):
|
||||
return self.weight
|
||||
|
||||
|
||||
def set_random_seed(seed):
|
||||
"""Set random seed for reproducability."""
|
||||
random.seed(seed)
|
||||
numpy.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
mpu.model_parallel_cuda_manual_seed(seed)
|
||||
|
||||
|
||||
def initialize_distributed(backend='nccl'):
|
||||
"""Initialize torch.distributed."""
|
||||
# Get local rank in case it is provided.
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--local_rank',
|
||||
type=int,
|
||||
default=None,
|
||||
help='local rank passed from distributed launcher')
|
||||
args = parser.parse_args()
|
||||
local_rank = args.local_rank
|
||||
|
||||
# Get rank and world size.
|
||||
rank = int(os.getenv('RANK', '0'))
|
||||
world_size = int(os.getenv('WORLD_SIZE', '1'))
|
||||
|
||||
print('> initializing torch.distributed with local rank: {}, '
|
||||
'rank: {}, world size: {}'.format(local_rank, rank, world_size))
|
||||
|
||||
# Set the device id.
|
||||
device = rank % torch.cuda.device_count()
|
||||
if local_rank is not None:
|
||||
device = local_rank
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
# Call the init process.
|
||||
init_method = 'tcp://'
|
||||
master_ip = os.getenv('MASTER_ADDR', 'localhost')
|
||||
master_port = os.getenv('MASTER_PORT', '6000')
|
||||
init_method += master_ip + ':' + master_port
|
||||
torch.distributed.init_process_group(
|
||||
backend=backend,
|
||||
world_size=world_size,
|
||||
rank=rank,
|
||||
init_method=init_method)
|
||||
|
||||
|
||||
def print_separator(message):
|
||||
torch.distributed.barrier()
|
||||
filler_len = (78 - len(message)) // 2
|
||||
filler = '-' * filler_len
|
||||
string = '\n' + filler + ' {} '.format(message) + filler
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(string, flush=True)
|
||||
torch.distributed.barrier()
|
||||
106
modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
import sys

import mpu
import torch
import torch.nn.functional as F
from commons import (IdentityLayer, initialize_distributed, print_separator,
                     set_random_seed)
from mpu.cross_entropy import vocab_parallel_cross_entropy

sys.path.append('../..')


def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale,
                        seed):
    set_random_seed(seed)
    identity = IdentityLayer((batch_size, seq_length, vocab_size),
                             scale=logits_scale).cuda()
    logits = identity()
    target = torch.cuda.LongTensor(size=(batch_size,
                                         seq_length)).random_(0, vocab_size)
    loss = F.cross_entropy(
        logits.view(-1,
                    logits.size()[-1]), target.view(-1),
        reduction='none').view_as(target).mean()
    loss.backward()
    return loss, identity.weight.grad


def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed):
    set_random_seed(seed)
    identity = IdentityLayer((batch_size, seq_length, vocab_size),
                             scale=logits_scale).cuda()
    logits = identity()
    logits_parallel = mpu.scatter_to_model_parallel_region(logits)
    target = torch.cuda.LongTensor(size=(batch_size,
                                         seq_length)).random_(0, vocab_size)
    loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
    loss.backward()
    return loss, identity.weight.grad


def test_cross_entropy(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing cross entropy with model parallel size {} ...'.format(
            model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    batch_size = 13
    seq_length = 17
    vocab_size_per_partition = 11
    logits_scale = 1000.0
    vocab_size = vocab_size_per_partition * model_parallel_size
    seed = 1234

    loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
                                                 vocab_size, logits_scale,
                                                 seed)
    loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, vocab_size,
                                           logits_scale, seed)

    error = loss_torch.sub_(loss_mpu).abs().max()
    print(' max error in loss on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = grad_torch.sub_(grad_mpu).abs().max()
    print(' max error in grad on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test cross entropy')
        test_cross_entropy(model_parallel_size)
        model_parallel_size *= 2
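Note: the property this test exercises can be sketched on a single device. Cross entropy over the full vocabulary equals the value reconstructed from vocabulary shards via a log-sum-exp decomposition, which is the identity vocab_parallel_cross_entropy relies on across ranks. A minimal single-process sketch in plain PyTorch (no model parallelism; the shapes and the two-way split are illustrative, not taken from this test):

import torch
import torch.nn.functional as F

batch, seq, vocab = 4, 8, 16
logits = torch.randn(batch, seq, vocab)
target = torch.randint(0, vocab, (batch, seq))

# Reference: full-vocabulary cross entropy per token.
full = F.cross_entropy(
    logits.view(-1, vocab), target.view(-1), reduction='none')

# Sharded: split the vocabulary into two partitions and recombine
# the per-shard log-sum-exps into the global one.
shards = torch.chunk(logits, 2, dim=-1)
lse = torch.logsumexp(
    torch.stack([s.logsumexp(dim=-1) for s in shards]), dim=0)
target_logit = logits.gather(-1, target.unsqueeze(-1)).squeeze(-1)
sharded = (lse - target_logit).view(-1)

assert torch.allclose(full, sharded, atol=1e-5)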
91
modelscope/models/nlp/mglm/mpu/tests/test_data.py
Normal file
@@ -0,0 +1,91 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
import operator
|
||||
import sys
|
||||
|
||||
import mpu
|
||||
import torch
|
||||
from commons import initialize_distributed, print_separator
|
||||
from mpu import data as data_utils
|
||||
|
||||
sys.path.append('../..')
|
||||
|
||||
|
||||
def test_broadcast_data(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(
|
||||
'> testing broadcast_data with model parallel size {} ...'.format(
|
||||
model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
torch.manual_seed(1234 + mpu.get_data_parallel_rank())
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
key_size_t = {
|
||||
'key1': [7, 11],
|
||||
'key2': [8, 2, 1],
|
||||
'key3': [13],
|
||||
'key4': [5, 1, 2],
|
||||
'key5': [5, 12]
|
||||
}
|
||||
keys = list(key_size_t.keys())
|
||||
|
||||
data = {}
|
||||
data_t = {}
|
||||
for key in key_size_t:
|
||||
data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
|
||||
data_t[key] = data[key].clone()
|
||||
data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
|
||||
data_t['keyX'] = data['keyX'].clone()
|
||||
if mpu.get_model_parallel_rank() != 0:
|
||||
data = None
|
||||
|
||||
data_utils._check_data_types(keys, data_t, torch.int64)
|
||||
key_size, key_numel, \
|
||||
total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
|
||||
for key in keys:
|
||||
assert key_size[key] == key_size_t[key]
|
||||
total_numel_t = 0
|
||||
for key in keys:
|
||||
target_size = functools.reduce(operator.mul, key_size_t[key], 1)
|
||||
assert key_numel[key] == target_size
|
||||
total_numel_t += target_size
|
||||
assert total_numel == total_numel_t
|
||||
|
||||
data_b = data_utils.broadcast_data(keys, data, torch.int64)
|
||||
for key in keys:
|
||||
tensor = data_t[key].cuda()
|
||||
assert data_b[key].sub(tensor).abs().max() == 0
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test broadcast data')
|
||||
test_broadcast_data(model_parallel_size)
|
||||
model_parallel_size *= 2
95
modelscope/models/nlp/mglm/mpu/tests/test_initialize.py
Normal file
@@ -0,0 +1,95 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
import mpu
|
||||
import torch
|
||||
from commons import initialize_distributed, print_separator
|
||||
|
||||
sys.path.append('../..')
|
||||
|
||||
|
||||
def test_initialize_model_parallel(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing initialize_model_parallel with size {} ...'.format(
|
||||
model_parallel_size))
|
||||
model_parallel_size_ = min(model_parallel_size,
|
||||
torch.distributed.get_world_size())
|
||||
assert not mpu.model_parallel_is_initialized()
|
||||
mpu.initialize_model_parallel(model_parallel_size_)
|
||||
assert mpu.model_parallel_is_initialized()
|
||||
|
||||
# Checks.
|
||||
def check(group, world_size, rank):
|
||||
assert world_size == torch.distributed.get_world_size(group=group)
|
||||
assert rank == torch.distributed.get_rank(group=group)
|
||||
|
||||
# Model parallel.
|
||||
world_size = model_parallel_size_
|
||||
rank = torch.distributed.get_rank() % model_parallel_size_
|
||||
assert world_size == mpu.get_model_parallel_world_size()
|
||||
assert rank == mpu.get_model_parallel_rank()
|
||||
check(mpu.get_model_parallel_group(), world_size, rank)
|
||||
|
||||
# Data parallel.
|
||||
world_size = torch.distributed.get_world_size() // model_parallel_size_
|
||||
rank = torch.distributed.get_rank() // model_parallel_size_
|
||||
assert world_size == mpu.get_data_parallel_world_size()
|
||||
assert rank == mpu.get_data_parallel_rank()
|
||||
check(mpu.get_data_parallel_group(), world_size, rank)
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_get_model_parallel_src_rank(model_parallel_size_):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing get_model_parallel_src_rank with size {} ...'.format(
|
||||
model_parallel_size_))
|
||||
model_parallel_size = min(model_parallel_size_,
|
||||
torch.distributed.get_world_size())
|
||||
assert not mpu.model_parallel_is_initialized()
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
assert mpu.model_parallel_is_initialized()
|
||||
|
||||
# Checks
|
||||
src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank()
|
||||
assert mpu.get_model_parallel_src_rank() == src_rank
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test initialize model parallel')
|
||||
test_initialize_model_parallel(model_parallel_size)
|
||||
print_separator('test model parallel source rank')
|
||||
test_get_model_parallel_src_rank(model_parallel_size)
|
||||
model_parallel_size *= 2
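Note: the rank bookkeeping these tests assert can be checked with plain arithmetic, no torch.distributed needed. A small sketch for a world size of 8 and a model-parallel size of 2 (numbers are illustrative):

world_size, mp = 8, 2
for g in range(world_size):
    mp_rank = g % mp   # rank inside the model-parallel group
    dp_rank = g // mp  # rank inside the data-parallel group
    src = g - mp_rank  # first global rank of the model-parallel group
    assert src == (g // mp) * mp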
533
modelscope/models/nlp/mglm/mpu/tests/test_layers.py
Normal file
@@ -0,0 +1,533 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import sys
|
||||
|
||||
import mpu
|
||||
import torch
|
||||
import torch.nn.init as init
|
||||
from commons import initialize_distributed, print_separator, set_random_seed
|
||||
from mpu import layers
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
sys.path.append('../..')
|
||||
|
||||
|
||||
def test_parallel_embedding(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing parallel embedding with model parallel size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
batch_size = 17
|
||||
seq_length = 23
|
||||
vocab_size = 48
|
||||
hidden_size = 16
|
||||
seed = 1236
|
||||
|
||||
set_random_seed(123)
|
||||
input_data = torch.LongTensor(size=(batch_size, seq_length)).random_(
|
||||
0, vocab_size).cuda()
|
||||
loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
|
||||
|
||||
output = embedding_original(input_data)
|
||||
loss_original = torch.mul(output, loss_weight).sum()
|
||||
loss_original.backward()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_parallel = layers.ParallelEmbedding(
|
||||
vocab_size, hidden_size, init_method=init.normal_).cuda()
|
||||
output = embedding_parallel(input_data)
|
||||
loss_parallel = torch.mul(output, loss_weight).sum()
|
||||
loss_parallel.backward()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_vocab_parallel = layers.VocabParallelEmbedding(
|
||||
vocab_size, hidden_size, init_method=init.normal_).cuda()
|
||||
output = embedding_vocab_parallel(input_data)
|
||||
loss_vocab_parallel = torch.mul(output, loss_weight).sum()
|
||||
loss_vocab_parallel.backward()
|
||||
|
||||
torch.distributed.barrier()
|
||||
error = loss_parallel.sub(loss_original).abs()
|
||||
print(' error in loss (parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
torch.distributed.barrier()
|
||||
error = loss_vocab_parallel.sub(loss_original).abs()
|
||||
print(' error in loss (vocab parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
weight_grad_orig = torch.split(embedding_original.weight.grad,
|
||||
hidden_size // model_parallel_size,
|
||||
1)[mpu.get_model_parallel_rank()]
|
||||
error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
|
||||
print(' error in grad (parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
weight_grad_orig = torch.split(embedding_original.weight.grad,
|
||||
vocab_size // model_parallel_size,
|
||||
0)[mpu.get_model_parallel_rank()]
|
||||
error = embedding_vocab_parallel.weight.grad.sub(
|
||||
weight_grad_orig).abs().max()
|
||||
print(' error in grad (vocab parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_initialize_affine_weight(model_parallel_size):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing initialize_affine_weight with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
input_size_coeff = 13
|
||||
input_size = input_size_coeff * model_parallel_size
|
||||
output_size_coeff = 17
|
||||
output_size = output_size_coeff * model_parallel_size
|
||||
|
||||
# ---------------
|
||||
# Column parallel
|
||||
# ---------------
|
||||
weight = torch.empty(output_size_coeff, input_size)
|
||||
set_random_seed(seed)
|
||||
layers._initialize_affine_weight(weight, output_size, input_size,
|
||||
output_size_coeff, 0,
|
||||
torch.nn.init.normal_)
|
||||
# Target.
|
||||
set_random_seed(seed)
|
||||
master_weight = torch.empty(output_size, input_size)
|
||||
torch.nn.init.normal_(master_weight)
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_weight = torch.split(
|
||||
master_weight, output_size_coeff, dim=0)[rank].contiguous().clone()
|
||||
|
||||
# Compare.
|
||||
error = weight.sub(my_weight).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' column parallel max error (should be zero) on global rank '
|
||||
'{}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# ------------
|
||||
# Row parallel
|
||||
# ------------
|
||||
weight = torch.empty(output_size, input_size_coeff)
|
||||
set_random_seed(seed)
|
||||
mpu.layers._initialize_affine_weight(weight, output_size, input_size,
|
||||
input_size_coeff, 1,
|
||||
torch.nn.init.normal_)
|
||||
# Target.
|
||||
set_random_seed(seed)
|
||||
master_weight = torch.empty(output_size, input_size)
|
||||
torch.nn.init.normal_(master_weight)
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_weight = torch.split(
|
||||
master_weight, input_size_coeff, dim=1)[rank].contiguous().clone()
|
||||
|
||||
# Compare.
|
||||
error = weight.sub(my_weight).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' row parallel max error (should be zero) on global rank '
|
||||
'{}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
class IdentityLayer2D(torch.nn.Module):
|
||||
|
||||
def __init__(self, m, n):
|
||||
super(IdentityLayer2D, self).__init__()
|
||||
self.weight = Parameter(torch.Tensor(m, n))
|
||||
torch.nn.init.xavier_normal_(self.weight)
|
||||
|
||||
def forward(self):
|
||||
return self.weight
|
||||
|
||||
|
||||
def test_column_parallel_linear(model_parallel_size):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing ColumnParallelLinear with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
set_random_seed(seed)
|
||||
input_size_coeff = 13
|
||||
input_size = input_size_coeff * model_parallel_size
|
||||
output_size_coeff = 17
|
||||
output_size = output_size_coeff * model_parallel_size
|
||||
batch_size = 7
|
||||
|
||||
# Network
|
||||
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
|
||||
linear_layer = mpu.ColumnParallelLinear(
|
||||
input_size, output_size, keep_master_weight_for_test=True).cuda()
|
||||
loss_weight = torch.randn([batch_size, output_size]).cuda()
|
||||
# Forward
|
||||
input_ = identity_layer()
|
||||
output = linear_layer(input_)
|
||||
loss = torch.mul(output, loss_weight).sum()
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
# Values.
|
||||
dLdY = loss_weight
|
||||
X = identity_layer.weight
|
||||
A = linear_layer.master_weight.cuda()
|
||||
dLdA = torch.matmul(dLdY.t(), X)
|
||||
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
|
||||
dLdX = torch.matmul(dLdY, A)
|
||||
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_dLdA = torch.split(
|
||||
dLdA, output_size_coeff, dim=0)[rank].contiguous().clone()
|
||||
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdA on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
my_dLdb = torch.split(
|
||||
dLdb, output_size_coeff, dim=0)[rank].contiguous().clone()
|
||||
error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdb on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
error = dLdX.sub(identity_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdX on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
def test_row_parallel_linear(model_parallel_size):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing RowParallelLinear with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
set_random_seed(seed)
|
||||
input_size_coeff = 13
|
||||
input_size = input_size_coeff * model_parallel_size
|
||||
output_size_coeff = 17
|
||||
output_size = output_size_coeff * model_parallel_size
|
||||
batch_size = 7
|
||||
|
||||
# Network
|
||||
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
|
||||
linear_layer = mpu.RowParallelLinear(
|
||||
input_size, output_size, keep_master_weight_for_test=True).cuda()
|
||||
loss_weight = torch.randn([batch_size, output_size]).cuda()
|
||||
# Forward
|
||||
input_ = identity_layer()
|
||||
output = linear_layer(input_)
|
||||
loss = torch.mul(output, loss_weight).sum()
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
# Values.
|
||||
dLdY = loss_weight
|
||||
X = identity_layer.weight
|
||||
A = linear_layer.master_weight.cuda()
|
||||
dLdA = torch.matmul(dLdY.t(), X)
|
||||
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
|
||||
dLdX = torch.matmul(dLdY, A)
|
||||
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_dLdA = torch.split(
|
||||
dLdA, input_size_coeff, dim=1)[rank].contiguous().clone()
|
||||
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdA on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
error = dLdb.sub(linear_layer.bias.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdb on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
error = dLdX.sub(identity_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdX on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
class IdentityLayer3D(torch.nn.Module):
|
||||
|
||||
def __init__(self, m, n, k):
|
||||
super(IdentityLayer3D, self).__init__()
|
||||
self.weight = Parameter(torch.Tensor(m, n, k))
|
||||
torch.nn.init.xavier_normal_(self.weight)
|
||||
|
||||
def forward(self):
|
||||
return self.weight
|
||||
|
||||
|
||||
def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, dropout_prob, batch_size,
|
||||
sequence_length):
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
set_random_seed(seed)
|
||||
|
||||
num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size(
|
||||
) # noqa
|
||||
hidden_size = hidden_size_per_att_head * num_att_heads
|
||||
|
||||
# Network
|
||||
identity_layer = IdentityLayer3D(batch_size, sequence_length,
|
||||
hidden_size).cuda()
|
||||
attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
|
||||
dropout_prob).cuda()
|
||||
loss_weight = torch.randn([batch_size, sequence_length,
|
||||
hidden_size]).cuda()
|
||||
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
|
||||
# Forward
|
||||
input_ = identity_layer()
|
||||
output = attention_layer(input_, attention_mask)
|
||||
loss = torch.mul(output, loss_weight).sum()
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
mpu.destroy_model_parallel()
|
||||
return rank, hidden_size, model_parallel_size, loss, \
|
||||
attention_layer, identity_layer
|
||||
|
||||
|
||||
def test_parallel_self_attention(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing ParallelSelfAttention with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
|
||||
num_att_heads_per_partition = 3
|
||||
hidden_size_per_att_head = 7
|
||||
dropout_prob = 0.0 # has to be zero
|
||||
batch_size = 5
|
||||
sequence_length = 13
|
||||
|
||||
rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
|
||||
attention_layer_1, identity_layer_1 = parallel_self_attention(
|
||||
1, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
|
||||
|
||||
rank, hidden_size, model_parallel_size, loss, \
|
||||
attention_layer, identity_layer = parallel_self_attention(
|
||||
model_parallel_size, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
|
||||
assert hidden_size_1 == hidden_size
|
||||
|
||||
error = loss_1.sub(loss).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' loss error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-6
|
||||
|
||||
my_lin_grad_list = torch.split(
|
||||
attention_layer_1.query_key_value.weight.grad,
|
||||
hidden_size // model_parallel_size, 0)[rank::model_parallel_size]
|
||||
my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
|
||||
error = my_lin_grad.sub(
|
||||
attention_layer.query_key_value.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' weight gradient error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-6
|
||||
|
||||
error = identity_layer_1.weight.grad.sub(
|
||||
identity_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' input gradient error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-6
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
def parallel_transformer(model_parallel_size, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, batch_size,
|
||||
sequence_length):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
set_random_seed(seed)
|
||||
|
||||
num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size(
|
||||
)
|
||||
hidden_size = hidden_size_per_att_head * num_att_heads
|
||||
intermediate_size = 4 * hidden_size
|
||||
|
||||
# Network
|
||||
identity_layer = IdentityLayer3D(batch_size, sequence_length,
|
||||
hidden_size).cuda()
|
||||
transformer_layer = mpu.BertParallelTransformerLayer(
|
||||
hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
|
||||
torch.nn.functional.relu, 1.0e-5).cuda()
|
||||
|
||||
loss_weight = torch.randn([batch_size, sequence_length,
|
||||
hidden_size]).cuda()
|
||||
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
|
||||
# Forward
|
||||
input_ = identity_layer()
|
||||
output = transformer_layer(input_, attention_mask)
|
||||
loss = torch.mul(output, loss_weight).sum()
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
mpu.destroy_model_parallel()
|
||||
return rank, hidden_size, model_parallel_size, loss, \
|
||||
transformer_layer, identity_layer
|
||||
|
||||
|
||||
def test_parallel_transformer_layer(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing ParallelTransformerLayer with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
|
||||
num_att_heads_per_partition = 3
|
||||
hidden_size_per_att_head = 7
|
||||
batch_size = 5
|
||||
sequence_length = 13
|
||||
|
||||
rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
|
||||
transformer_layer_1, identity_layer_1 = parallel_transformer(
|
||||
1, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, batch_size, sequence_length)
|
||||
|
||||
rank, hidden_size, model_parallel_size, loss, \
|
||||
transformer_layer, identity_layer = parallel_transformer(
|
||||
model_parallel_size, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, batch_size, sequence_length)
|
||||
|
||||
error = loss_1.sub(loss).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' loss error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-5, 'error: {}'.format(error)
|
||||
|
||||
error = identity_layer_1.weight.grad.sub(
|
||||
identity_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' input gradient error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-5, 'error: {}'.format(error)
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
|
||||
print_separator('test initialize affine weight')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_initialize_affine_weight(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test parallel embedding')
|
||||
test_parallel_embedding(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
print_separator('test column-parallel linear')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_column_parallel_linear(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
print_separator('test row-parallel linear')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_row_parallel_linear(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
print_separator('test parallel self-attention')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_parallel_self_attention(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
print_separator('test parallel transformer')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_parallel_transformer_layer(model_parallel_size)
|
||||
model_parallel_size *= 2
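Note: the closed-form gradients the column/row-parallel linear tests compare against (dLdA = dLdY^T X, dLdb = 1^T dLdY, dLdX = dLdY A) can be verified on a single device with an ordinary nn.Linear. A minimal sketch with illustrative sizes:

import torch

batch, in_f, out_f = 7, 13, 17
x = torch.randn(batch, in_f, requires_grad=True)
linear = torch.nn.Linear(in_f, out_f)  # y = x @ A.t() + b
loss_weight = torch.randn(batch, out_f)

loss = (linear(x) * loss_weight).sum()
loss.backward()

dLdY = loss_weight
# dLdA = dLdY^T X
assert torch.allclose(linear.weight.grad, dLdY.t() @ x.detach(), atol=1e-5)
# dLdb = 1^T dLdY
assert torch.allclose(linear.bias.grad, dLdY.sum(dim=0), atol=1e-5)
# dLdX = dLdY A
assert torch.allclose(x.grad, dLdY @ linear.weight.detach(), atol=1e-5)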
206
modelscope/models/nlp/mglm/mpu/tests/test_random.py
Normal file
@@ -0,0 +1,206 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
import mpu
|
||||
import torch
|
||||
from commons import initialize_distributed, print_separator
|
||||
|
||||
sys.path.append('../..')
|
||||
|
||||
|
||||
def test_set_cuda_rng_state(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing set_rng_state with size {} ...'.format(
|
||||
model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
size = 123
|
||||
seed = 1234
|
||||
torch.cuda.manual_seed(seed)
|
||||
tensor = torch.cuda.FloatTensor(size)
|
||||
|
||||
# Get the state
|
||||
rng_state = torch.cuda.get_rng_state()
|
||||
rng_state_copy = rng_state.clone()
|
||||
|
||||
# Do some stuff.
|
||||
for _ in range(5):
|
||||
torch.randn(size, out=tensor)
|
||||
result_1 = tensor.clone()
|
||||
|
||||
assert rng_state.sub(rng_state_copy).max() == 0
|
||||
assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0
|
||||
|
||||
# State should be different.
|
||||
new_rng_state = torch.cuda.get_rng_state()
|
||||
max_diff = new_rng_state.sub(rng_state).max()
|
||||
print(
|
||||
' max diff in rng state (should be non-zero) on global rank {}: {}'.
|
||||
format(torch.distributed.get_rank(), max_diff))
|
||||
assert max_diff > 0
|
||||
|
||||
# Reset the rng state and do the same stuff.
|
||||
mpu.random._set_cuda_rng_state(rng_state)
|
||||
for _ in range(5):
|
||||
torch.randn(size, out=tensor)
|
||||
mpu.random._set_cuda_rng_state(rng_state)
|
||||
for _ in range(5):
|
||||
torch.randn(size, out=tensor)
|
||||
result_2 = tensor.clone()
|
||||
|
||||
# Results should be the same
|
||||
error = result_2.sub(result_1).abs().max()
|
||||
print(' max error in generated tensors (should be zero) on '
|
||||
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Input state should have remained intact.
|
||||
error = rng_state.sub(rng_state_copy).max()
|
||||
print(' max error in rng state (should be zero) on global rank {}: {}'.
|
||||
format(torch.distributed.get_rank(), error))
|
||||
assert error == 0
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_cuda_rng_tracker(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing cuda rng tracker with size {} ...'.format(
|
||||
model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed_1 = 1234
|
||||
seed_2 = 4321
|
||||
size = [12, 21]
|
||||
tensor = torch.cuda.FloatTensor(size)
|
||||
|
||||
# Set to seed_1 and generate two tensors.
|
||||
torch.cuda.manual_seed(seed_1)
|
||||
torch.randn(size, out=tensor)
|
||||
target_11 = tensor.clone()
|
||||
torch.randn(size, out=tensor)
|
||||
target_12 = tensor.clone()
|
||||
|
||||
# Set to seed_2 and generate two tensors.
|
||||
torch.cuda.manual_seed(seed_2)
|
||||
torch.randn(size, out=tensor)
|
||||
target_21 = tensor.clone()
|
||||
torch.randn(size, out=tensor)
|
||||
target_22 = tensor.clone()
|
||||
|
||||
# Now if we interleave seed_1 and seed_2,
|
||||
# we should still get the same tensors
|
||||
torch.cuda.manual_seed(seed_1)
|
||||
mpu.get_cuda_rng_tracker().add('test', seed_2)
|
||||
|
||||
torch.randn(size, out=tensor)
|
||||
result_11 = tensor.clone()
|
||||
|
||||
with mpu.get_cuda_rng_tracker().fork('test'):
|
||||
torch.randn(size, out=tensor)
|
||||
result_21 = tensor.clone()
|
||||
|
||||
torch.randn(size, out=tensor)
|
||||
result_12 = tensor.clone()
|
||||
|
||||
with mpu.get_cuda_rng_tracker().fork('test'):
|
||||
torch.randn(size, out=tensor)
|
||||
result_22 = tensor.clone()
|
||||
|
||||
diff = result_11.sub(result_21).abs().max()
|
||||
diff = min(diff, result_12.sub(result_22).abs().max())
|
||||
print(' max diff in generated tensors (should be non-zero) on '
|
||||
'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
|
||||
assert diff > 1.0e-6
|
||||
error = max(
|
||||
result_11.sub(target_11).abs().max(),
|
||||
result_12.sub(target_12).abs().max())
|
||||
error = max(error, result_21.sub(target_21).abs().max())
|
||||
error = max(error, result_22.sub(target_22).abs().max())
|
||||
print(' max error in generated tensors (should be zero) on '
|
||||
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset the tracker
|
||||
mpu.get_cuda_rng_tracker().reset()
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_model_parallel_cuda_manual_seed(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing model parallel cuda manual seed with size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
mpu.model_parallel_cuda_manual_seed(12345)
|
||||
assert torch.cuda.initial_seed() == 12345
|
||||
with mpu.get_cuda_rng_tracker().fork():
|
||||
assert torch.cuda.initial_seed() == (12345 + 2718
|
||||
+ mpu.get_model_parallel_rank())
|
||||
|
||||
# Reset the tracker
|
||||
mpu.get_cuda_rng_tracker().reset()
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test set rng state')
|
||||
test_set_cuda_rng_state(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test cuda rng tracker')
|
||||
test_cuda_rng_tracker(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test model parallel cuda manual seed')
|
||||
test_model_parallel_cuda_manual_seed(model_parallel_size)
|
||||
model_parallel_size *= 2
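Note: a single-process analogue of the fork-and-restore pattern tested above is torch.random.fork_rng, which restores the generator state on exit; this sketch forks only the CPU generator and is not the mpu tracker API:

import torch

torch.manual_seed(1234)
a1 = torch.randn(3)

torch.manual_seed(1234)
with torch.random.fork_rng(devices=[]):
    torch.manual_seed(4321)
    torch.randn(3)  # consumes only the forked state
a2 = torch.randn(3)

assert torch.equal(a1, a2)  # the outer RNG stream was not disturbed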
1200
modelscope/models/nlp/mglm/mpu/transformer.py
Executable file
File diff suppressed because it is too large
70
modelscope/models/nlp/mglm/mpu/utils.py
Normal file
@@ -0,0 +1,70 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Ensure that numerator is divisible by the denominator and return
    the division value."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(tensor,
                                num_partitions,
                                contiguous_split_chunks=False):
    """Split a tensor along its last dimension.
    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list


class VocabUtility:
    """Split the vocabulary into `world_size` chunks and return the
    first and last index of the vocabulary belonging to the `rank`
    partition: note that indices are in [first, last)."""

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        index_f = rank * per_partition_vocab_size
        index_l = index_f + per_partition_vocab_size
        return index_f, index_l

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank,
                                           world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)
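Note: a small usage sketch of the helpers above, assuming the package is importable at the path shown in this diff (sizes are illustrative):

import torch

from modelscope.models.nlp.mglm.mpu.utils import (
    VocabUtility, divide, split_tensor_along_last_dim)

# divide() enforces an exact split.
per_rank = divide(64, 4)  # 16

# Split a (2, 3, 64) activation into 4 contiguous chunks of width 16.
chunks = split_tensor_along_last_dim(
    torch.randn(2, 3, 64), 4, contiguous_split_chunks=True)
assert all(chunk.shape[-1] == per_rank for chunk in chunks)

# Vocabulary slice owned by rank 1 of 4 for a 32000-token vocabulary.
first, last = VocabUtility.vocab_range_from_global_vocab_size(32000, 1, 4)
assert (first, last) == (8000, 16000)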
61
modelscope/models/nlp/mglm/process_grid.py
Normal file
@@ -0,0 +1,61 @@
# Copyright (c) 2022 Zhipu.AI

import glob
import os
import statistics
import sys

import json

path_pattern = sys.argv[1]
target_type = sys.argv[2]
best_value, best_result, best_name = None, None, None
mean_result = {}
print(path_pattern)
for dir_path in glob.glob(path_pattern, recursive=True):
    entry = os.path.basename(dir_path)
    valid_result = None
    test_found = os.path.exists(os.path.join(dir_path, 'test_results.json'))
    valid_path = os.path.join(dir_path, 'results.json')
    if os.path.exists(valid_path):
        print(entry)
        with open(valid_path) as file:
            valid_result = json.load(file)
    else:
        print(f'{entry} no validation results')
        continue
    if not test_found:
        print(f'{entry} not tested yet')
    if target_type == 'max':
        metric = sys.argv[3]
        metric_value = valid_result[metric]
        if best_value is None or metric_value > best_value:
            best_value = metric_value
            best_result = valid_result
            best_name = entry
    elif target_type == 'mean' or target_type == 'median':
        if mean_result:
            for metric, value in valid_result.items():
                if metric not in ['type', 'epoch']:
                    mean_result[metric].append(value)
        else:
            mean_result = {
                metric: [value]
                for metric, value in valid_result.items()
                if metric not in ['type', 'epoch']
            }

if target_type == 'max':
    print(f'Best result found at {best_name}: {best_result}')
elif target_type == 'mean':
    mean_result = {
        metric: sum(value) / len(value)
        for metric, value in mean_result.items()
    }
    print(f'Mean result {mean_result}')
elif target_type == 'median':
    mean_result = {
        metric: statistics.median(value)
        for metric, value in mean_result.items()
    }
    print(f'Median result {mean_result}')
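Note: process_grid.py expects each run directory matched by the path pattern to contain a results.json with validation metrics and, once evaluated, a test_results.json. A hypothetical layout and invocation (directory names, metric keys and paths are illustrative only):

import os

import json

os.makedirs('runs/run-0', exist_ok=True)
with open('runs/run-0/results.json', 'w') as f:
    json.dump({'type': 'glm', 'epoch': 3, 'rouge-1': 42.1}, f)

# Something like `python process_grid.py 'runs/*' max rouge-1` would then
# report run-0 as the best run according to the rouge-1 key.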
22
modelscope/models/nlp/mglm/requirements.txt
Normal file
@@ -0,0 +1,22 @@
boto3
botocore
deepspeed
fasttext
filelock
ftfy
langdetect
lsh
matplotlib
mpi4py
nltk
pandas
regex
requests
rouge_score
scikit_learn
scipy
sentencepiece
termcolor
tldextract
tqdm
transformers
10
modelscope/models/nlp/mglm/run_test.py
Normal file
@@ -0,0 +1,10 @@
# Copyright (c) 2022 Zhipu.AI

import sys

if sys.argv[1] == 'block':
    from test.test_block import main
    main()
elif sys.argv[1] == 'rel_shift':
    from test.test_rel_shift import main
    main()
389
modelscope/models/nlp/mglm/tasks/data_utils.py
Normal file
@@ -0,0 +1,389 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Tasks data utility."""
|
||||
import copy
|
||||
import pickle
|
||||
import re
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.data
|
||||
from torch.utils.data.dataloader import default_collate
|
||||
|
||||
from modelscope.models.nlp.mglm import mpu
|
||||
|
||||
|
||||
def clean_text(text):
|
||||
"""Remove new lines and multiple spaces and adjust end of sentence dot."""
|
||||
|
||||
text = text.replace('\n', ' ')
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
for _ in range(3):
|
||||
text = text.replace(' . ', '. ')
|
||||
|
||||
return text
|
||||
|
||||
|
||||
class InputExample(object):
|
||||
"""A raw input example consisting of one or two segments of text and a label"""
|
||||
|
||||
def __init__(self,
|
||||
guid,
|
||||
text_a,
|
||||
text_b=None,
|
||||
label=None,
|
||||
logits=None,
|
||||
meta: Optional[Dict] = None,
|
||||
idx=-1,
|
||||
num_choices=1):
|
||||
"""
|
||||
Create a new InputExample.
|
||||
|
||||
:param guid: a unique textual identifier
|
||||
:param text_a: the sequence of text
|
||||
:param text_b: an optional, second sequence of text
|
||||
:param label: an optional label
|
||||
:param logits: an optional list of per-class logits
|
||||
:param meta: an optional dictionary to store arbitrary meta information
|
||||
:param idx: an optional numeric index
|
||||
"""
|
||||
self.guid = guid
|
||||
self.text_a = text_a
|
||||
self.text_b = text_b
|
||||
self.label = label
|
||||
self.logits = logits
|
||||
self.idx = idx
|
||||
self.num_choices = num_choices
|
||||
self.meta = meta if meta else {}
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_json_string())
|
||||
|
||||
def to_dict(self):
|
||||
"""Serialize this instance to a Python dictionary."""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
return output
|
||||
|
||||
def to_json_string(self):
|
||||
"""Serialize this instance to a JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'
|
||||
|
||||
@staticmethod
|
||||
def load_examples(path: str) -> List['InputExample']:
|
||||
"""Load a set of input examples from a file"""
|
||||
with open(path, 'rb') as fh:
|
||||
return pickle.load(fh)
|
||||
|
||||
@staticmethod
|
||||
def save_examples(examples: List['InputExample'], path: str) -> None:
|
||||
"""Save a set of input examples to a file"""
|
||||
with open(path, 'wb') as fh:
|
||||
pickle.dump(examples, fh)
|
||||
|
||||
|
||||
def num_special_tokens_to_add(text_a_ids,
|
||||
text_b_ids,
|
||||
answer_ids,
|
||||
add_cls,
|
||||
add_sep,
|
||||
add_piece,
|
||||
add_eos=True):
|
||||
num_tokens = 0
|
||||
if add_cls:
|
||||
num_tokens += 1
|
||||
if text_b_ids and add_sep:
|
||||
num_tokens += 1
|
||||
if add_eos:
|
||||
num_tokens += 1
|
||||
if not answer_ids and add_piece:
|
||||
num_tokens += 1
|
||||
return num_tokens
|
||||
|
||||
|
||||
def build_input_from_ids(text_a_ids,
|
||||
text_b_ids,
|
||||
answer_ids,
|
||||
max_seq_length,
|
||||
tokenizer,
|
||||
args=None,
|
||||
add_cls=True,
|
||||
add_sep=False,
|
||||
add_piece=False,
|
||||
add_eos=True,
|
||||
mask_id=None):
|
||||
if mask_id is None:
|
||||
mask_id = tokenizer.get_command('MASK').Id
|
||||
eos_id = tokenizer.get_command('eos').Id
|
||||
cls_id = tokenizer.get_command('ENC').Id
|
||||
sep_id = tokenizer.get_command('sep').Id
|
||||
ids = []
|
||||
types = []
|
||||
paddings = []
|
||||
# CLS
|
||||
if add_cls:
|
||||
ids.append(cls_id)
|
||||
types.append(0)
|
||||
paddings.append(1)
|
||||
# A
|
||||
len_text_a = len(text_a_ids)
|
||||
ids.extend(text_a_ids)
|
||||
types.extend([0] * len_text_a)
|
||||
paddings.extend([1] * len_text_a)
|
||||
# B
|
||||
if text_b_ids is not None:
|
||||
# SEP
|
||||
if add_sep:
|
||||
ids.append(sep_id)
|
||||
types.append(0)
|
||||
paddings.append(1)
|
||||
len_text_b = len(text_b_ids)
|
||||
ids.extend(text_b_ids)
|
||||
types.extend([1] * len_text_b)
|
||||
paddings.extend([1] * len_text_b)
|
||||
eos_length = 1 if add_eos else 0
|
||||
# Cap the size.
|
||||
if len(ids) >= max_seq_length - eos_length:
|
||||
max_seq_length_m1 = max_seq_length - 1
|
||||
ids = ids[0:max_seq_length_m1]
|
||||
types = types[0:max_seq_length_m1]
|
||||
paddings = paddings[0:max_seq_length_m1]
|
||||
end_type = 0 if text_b_ids is None else 1
|
||||
if add_eos:
|
||||
ids.append(eos_id)
|
||||
types.append(end_type)
|
||||
paddings.append(1)
|
||||
sep = len(ids)
|
||||
target_ids = [0] * len(ids)
|
||||
loss_masks = [0] * len(ids)
|
||||
position_ids = list(range(len(ids)))
|
||||
block_position_ids = [0] * len(ids)
|
||||
# Piece
|
||||
if add_piece or answer_ids is not None:
|
||||
sop_id = tokenizer.get_command('sop').Id
|
||||
mask_position = ids.index(
|
||||
mask_id
|
||||
) if not args.sentinel_token else args.max_position_embeddings
|
||||
ids.append(sop_id)
|
||||
types.append(end_type)
|
||||
paddings.append(1)
|
||||
position_ids.append(mask_position)
|
||||
block_position_ids.append(1)
|
||||
if answer_ids is not None:
|
||||
len_answer = len(answer_ids)
|
||||
ids.extend(answer_ids[:-1])
|
||||
types.extend([end_type] * (len_answer - 1))
|
||||
paddings.extend([1] * (len_answer - 1))
|
||||
position_ids.extend([mask_position] * (len_answer - 1))
|
||||
if not args.no_block_position:
|
||||
block_position_ids.extend(range(2, len(answer_ids) + 1))
|
||||
else:
|
||||
block_position_ids.extend([1] * (len(answer_ids) - 1))
|
||||
target_ids.extend(answer_ids)
|
||||
loss_masks.extend([1] * len(answer_ids))
|
||||
else:
|
||||
target_ids.append(0)
|
||||
loss_masks.append(1)
|
||||
# Padding.
|
||||
padding_length = max_seq_length - len(ids)
|
||||
if padding_length > 0:
|
||||
ids.extend([eos_id] * padding_length)
|
||||
types.extend([eos_id] * padding_length)
|
||||
paddings.extend([0] * padding_length)
|
||||
position_ids.extend([0] * padding_length)
|
||||
block_position_ids.extend([0] * padding_length)
|
||||
target_ids.extend([0] * padding_length)
|
||||
loss_masks.extend([0] * padding_length)
|
||||
if not args.masked_lm:
|
||||
position_ids = [position_ids, block_position_ids]
|
||||
return ids, types, paddings, position_ids, sep, target_ids, loss_masks
|
||||
|
||||
|
||||
def build_decoder_input(enc_ids, answer_ids, max_seq_length,
|
||||
max_dec_seq_length, tokenizer):
|
||||
mask_id = tokenizer.get_command('MASK').Id
|
||||
eos_id = tokenizer.get_command('eos').Id
|
||||
sop_id = tokenizer.get_command('sop').Id
|
||||
enc_len = len(enc_ids) # noqa
|
||||
masks = []
|
||||
# TODO: it probably takes too much memory
|
||||
# for i in range(max_dec_seq_length):
|
||||
# m = [1]*enc_len + [0]*(max_seq_length - enc_len) + [1]*(i+1) + [0]*(max_dec_seq_length-1-i)
|
||||
# masks.append(m)
|
||||
mask_position = enc_ids.index(mask_id)
|
||||
len_answer = len(answer_ids)
|
||||
ids = [sop_id] + answer_ids[:-1]
|
||||
types = [0] * len_answer # not used
|
||||
paddings = [1] * len_answer
|
||||
position_ids = [mask_position] * len_answer
|
||||
block_position_ids = list(range(1, len_answer + 1))
|
||||
target_ids = answer_ids
|
||||
loss_masks = [1] * len_answer
|
||||
# Padding.
|
||||
padding_length = max_dec_seq_length - len(ids)
|
||||
if padding_length > 0:
|
||||
ids.extend([eos_id] * padding_length)
|
||||
types.extend([0] * padding_length)
|
||||
paddings.extend([0] * padding_length)
|
||||
position_ids.extend([0] * padding_length)
|
||||
block_position_ids.extend([0] * padding_length)
|
||||
target_ids.extend([0] * padding_length)
|
||||
loss_masks.extend([0] * padding_length)
|
||||
position_ids = [position_ids, block_position_ids]
|
||||
return ids, types, paddings, position_ids, masks, target_ids, loss_masks
|
||||
|
||||
|
||||
def build_sample(ids,
|
||||
types=None,
|
||||
paddings=None,
|
||||
positions=None,
|
||||
masks=None,
|
||||
label=None,
|
||||
unique_id=None,
|
||||
target=None,
|
||||
logit_mask=None,
|
||||
segment_ids=None,
|
||||
prompt_ids=None):
|
||||
"""Convert to numpy and return a sample consumed by the batch producer."""
|
||||
|
||||
ids_np = np.array(ids, dtype=np.int64)
|
||||
sample = {'text': ids_np, 'label': int(label)}
|
||||
if types is not None:
|
||||
types_np = np.array(types, dtype=np.int64)
|
||||
sample['types'] = types_np
|
||||
if paddings is not None:
|
||||
paddings_np = np.array(paddings, dtype=np.int64)
|
||||
sample['padding_mask'] = paddings_np
|
||||
if positions is not None:
|
||||
positions_np = np.array(positions, dtype=np.int64)
|
||||
sample['position'] = positions_np
|
||||
if masks is not None:
|
||||
masks_np = np.array(masks, dtype=np.int64)
|
||||
sample['mask'] = masks_np
|
||||
if target is not None:
|
||||
target_np = np.array(target, dtype=np.int64)
|
||||
sample['target'] = target_np
|
||||
if logit_mask is not None:
|
||||
logit_mask_np = np.array(logit_mask, dtype=np.int64)
|
||||
sample['logit_mask'] = logit_mask_np
|
||||
if segment_ids is not None:
|
||||
segment_ids = np.array(segment_ids, dtype=np.int64)
|
||||
sample['segment_id'] = segment_ids
|
||||
if prompt_ids is not None:
|
||||
prompt_ids = np.array(prompt_ids, dtype=np.int64)
|
||||
sample['prompt_pos'] = prompt_ids
|
||||
if unique_id is not None:
|
||||
sample['uid'] = unique_id
|
||||
return sample
|
||||
|
||||
|
||||
def build_decoder_sample(sample, dec_ids, dec_position, dec_masks, dec_target,
|
||||
dec_logit_mask):
|
||||
sample['dec_text'] = np.array(dec_ids)
|
||||
sample['dec_position'] = np.array(dec_position)
|
||||
sample['dec_mask'] = np.array(dec_masks)
|
||||
sample['dec_target'] = np.array(dec_target)
|
||||
sample['dec_logit_mask'] = np.array(dec_logit_mask)
|
||||
return sample
|
||||
|
||||
|
||||
def my_collate(batch):
|
||||
new_batch = [{key: value
|
||||
for key, value in sample.items() if key != 'uid'}
|
||||
for sample in batch]
|
||||
text_list = [sample['text'] for sample in batch]
|
||||
|
||||
def pad_choice_dim(data, choice_num):
|
||||
if len(data) < choice_num:
|
||||
data = np.concatenate([data]
|
||||
+ [data[0:1]] * (choice_num - len(data)))
|
||||
return data
|
||||
|
||||
if len(text_list[0].shape) == 2:
|
||||
choice_nums = list(map(len, text_list))
|
||||
max_choice_num = max(choice_nums)
|
||||
for i, sample in enumerate(new_batch):
|
||||
for key, value in sample.items():
|
||||
if key != 'label':
|
||||
sample[key] = pad_choice_dim(value, max_choice_num)
|
||||
else:
|
||||
sample[key] = value
|
||||
sample['loss_mask'] = np.array(
|
||||
[1] * choice_nums[i] + [0] * (max_choice_num - choice_nums[i]),
|
||||
dtype=np.int64)
|
||||
|
||||
if 'dec_text' in new_batch[0]:
|
||||
choice_nums = [len(sample['dec_text']) for sample in new_batch]
|
||||
if choice_nums.count(choice_nums[0]) != len(choice_nums):
|
||||
max_choice_num = max(choice_nums)
|
||||
for i, sample in enumerate(new_batch):
|
||||
for key, value in sample.items():
|
||||
if key.startswith('dec_'):
|
||||
sample[key] = pad_choice_dim(value, max_choice_num)
|
||||
sample['loss_mask'] = np.array(
|
||||
[1] * choice_nums[i] + [0] * # noqa
|
||||
(max_choice_num - choice_nums[i]),
|
||||
dtype=np.int64)
|
||||
|
||||
new_batch = default_collate(new_batch)
|
||||
if 'uid' in batch[0]:
|
||||
uid_list = [sample['uid'] for sample in batch]
|
||||
new_batch['uid'] = uid_list
|
||||
return new_batch
|
||||
|
||||
|
||||
class FakeDataloader:
|
||||
|
||||
def __init__(self, num_iters):
|
||||
self.num_iters = num_iters
|
||||
|
||||
def __iter__(self):
|
||||
if self.num_iters is not None:
|
||||
for _ in range(self.num_iters):
|
||||
yield None
|
||||
else:
|
||||
while True:
|
||||
yield None
|
||||
|
||||
|
||||
def build_data_loader(dataset,
|
||||
batch_size,
|
||||
num_workers,
|
||||
drop_last,
|
||||
shuffle=True,
|
||||
only_rank0=False):
|
||||
"""Data loader. Note that batch-size is the local (per GPU) batch-size."""
|
||||
|
||||
# Sampler.
|
||||
if only_rank0:
|
||||
rank, world_size = 0, 1
|
||||
else:
|
||||
world_size = mpu.get_data_parallel_world_size()
|
||||
rank = mpu.get_data_parallel_rank()
|
||||
sampler = torch.utils.data.distributed.DistributedSampler(
|
||||
dataset, num_replicas=world_size, rank=rank, shuffle=shuffle)
|
||||
|
||||
# Data loader. Note that batch size is the per GPU batch size.
|
||||
data_loader = torch.utils.data.DataLoader(
|
||||
dataset,
|
||||
batch_size=batch_size,
|
||||
sampler=sampler,
|
||||
shuffle=False,
|
||||
num_workers=num_workers,
|
||||
drop_last=drop_last,
|
||||
pin_memory=True,
|
||||
collate_fn=my_collate)
|
||||
|
||||
return data_loader
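Note: a hedged sketch of how build_sample, my_collate and build_data_loader fit together outside the finetuning pipeline, assuming the helpers defined in this file are in scope; the toy dataset and sizes are illustrative, and only_rank0=True sidesteps the need for an initialized data-parallel group:

import numpy as np
import torch


class ToyDataset(torch.utils.data.Dataset):

    def __init__(self, num_samples=8, seq_length=16):
        self.samples = [
            build_sample(
                ids=np.random.randint(0, 100, seq_length),
                label=i % 2,
                unique_id=i) for i in range(num_samples)
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


loader = build_data_loader(
    ToyDataset(),
    batch_size=4,
    num_workers=0,
    drop_last=False,
    shuffle=False,
    only_rank0=True)
batch = next(iter(loader))  # dict of collated tensors plus a 'uid' list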
249
modelscope/models/nlp/mglm/tasks/eval_utils.py
Normal file
@@ -0,0 +1,249 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Evaluation utilities."""
|
||||
|
||||
import datetime
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from collections import OrderedDict
|
||||
from typing import List
|
||||
|
||||
import mpu
|
||||
import torch
|
||||
from finetune_glm import process_batch
|
||||
from sklearn.metrics import f1_score
|
||||
from tasks.data_utils import InputExample, build_data_loader
|
||||
from utils import debug_finetune_data, get_spare_port, print_rank_0
|
||||
|
||||
|
||||
def accuracy_metric(predictions, labels, examples):
|
||||
count = 0
|
||||
num_predictions = max(len(predictions), 1)
|
||||
assert len(predictions) == len(labels)
|
||||
for prediction, label in zip(predictions, labels):
|
||||
count += prediction == label
|
||||
return count * 100.0 / num_predictions
|
||||
|
||||
|
||||
def f1_metric(predictions, labels, examples):
|
||||
return f1_score(labels, predictions)
|
||||
|
||||
|
||||
def f1_macro_metric(predictions, labels, examples):
|
||||
return f1_score(labels, predictions, average='macro')
|
||||
|
||||
|
||||
global_tokenizer = None
|
||||
|
||||
|
||||
def accuracy_func_provider(single_dataset_provider,
|
||||
metric_dict,
|
||||
args,
|
||||
is_test=False,
|
||||
eval_func=None,
|
||||
output_func=None,
|
||||
only_rank0=True,
|
||||
tokenizer=None):
|
||||
"""Provide function that calculates accuracies."""
|
||||
# Build dataloaders.
|
||||
global global_tokenizer
|
||||
global_tokenizer = tokenizer
|
||||
if only_rank0 and torch.distributed.is_initialized(
|
||||
) and torch.distributed.get_rank() != 0:
|
||||
return None
|
||||
if is_test and not args.eval_valid:
|
||||
datapaths = args.test_data if args.test_data is not None else ['test']
|
||||
else:
|
||||
datapaths = args.valid_data if args.valid_data is not None else ['dev']
|
||||
if eval_func is None:
|
||||
eval_func = multichoice_evaluate
|
||||
dataloaders = []
|
||||
eval_batch_size = args.eval_batch_size if args.eval_batch_size else args.batch_size
|
||||
for datapath in datapaths:
|
||||
dataset = single_dataset_provider(datapath)
|
||||
dataloader = build_data_loader(
|
||||
dataset,
|
||||
eval_batch_size,
|
||||
num_workers=args.num_workers,
|
||||
drop_last=False,
|
||||
shuffle=False,
|
||||
only_rank0=only_rank0)
|
||||
dataloaders.append((dataset.dataset_name, dataloader))
|
||||
|
||||
def metrics_func(model,
|
||||
epoch,
|
||||
output_predictions=False,
|
||||
summary_writer=None):
|
||||
print_rank_0('calculating metrics ...')
|
||||
score_dict = OrderedDict([(key, 0.0) for key in metric_dict
|
||||
]) if isinstance(metric_dict, dict) else {
|
||||
metric_dict: 0.0
|
||||
} # noqa
|
||||
total = 0
|
||||
for name, dataloader in dataloaders:
|
||||
example_dict = None
|
||||
if hasattr(dataloader.dataset, 'examples'):
|
||||
example_dict = dataloader.dataset.examples
|
||||
start_time = time.time()
|
||||
predictions, labels, examples = eval_func(model, dataloader,
|
||||
example_dict, args)
|
||||
elapsed_time = time.time() - start_time
|
||||
if output_predictions and torch.distributed.get_rank() == 0:
|
||||
filename = os.path.join(args.log_dir, name + '.jsonl')
|
||||
output_func(predictions, examples, filename)
|
||||
total_count = len(predictions)
|
||||
single_dict = {
|
||||
key: metric(predictions, labels, examples)
|
||||
for key, metric in metric_dict.items()
|
||||
}
|
||||
output_str = ' > |epoch: {}| metrics for {}: total {}'.format(
|
||||
epoch, name, total_count)
|
||||
for key, value in single_dict.items():
|
||||
output_str += ' {} = {:.4f} %'.format(key, value)
|
||||
if summary_writer is not None and epoch >= 0 and not is_test and len(
|
||||
dataloaders) > 1:
|
||||
summary_writer.add_scalar(f'Train/valid_{name}_{key}',
|
||||
value, epoch)
|
||||
output_str += ' elapsed time (sec): {:.3f}'.format(elapsed_time)
|
||||
if len(dataloaders) > 1:
|
||||
print_rank_0(output_str)
|
||||
for key in score_dict:
|
||||
score_dict[key] += single_dict[key] * total_count
|
||||
total += total_count
|
||||
score_dict = {
|
||||
key: score / float(total)
|
||||
for key, score in score_dict.items()
|
||||
}
|
||||
output_str = ' >> |epoch: {}| overall: total = {}'.format(epoch, total)
|
||||
for key, score in score_dict.items():
|
||||
output_str += ' {} = {:.4f}'.format(key, score)
|
||||
if summary_writer is not None and epoch >= 0 and not is_test:
|
||||
summary_writer.add_scalar(f'Train/valid_{key}', score, epoch)
|
||||
print_rank_0(output_str)
|
||||
return score_dict
|
||||
|
||||
return metrics_func
|
||||
|
||||
|
||||
segment_length = 10
|
||||
|
||||
|
||||
def multichoice_evaluate(model, dataloader, example_dict, args):
|
||||
"""Calculate correct over total answers and return prediction if the
|
||||
`output_predictions` is true."""
|
||||
model.eval()
|
||||
port = get_spare_port(args)
|
||||
print_rank_0(f'Using port {port}')
|
||||
store = torch.distributed.TCPStore(args.master_ip, port,
|
||||
torch.distributed.get_world_size(),
|
||||
torch.distributed.get_rank() == 0,
|
||||
datetime.timedelta(seconds=30))
|
||||
# file_path = os.path.join("/cache", args.experiment_name + "_store")
|
||||
# print_rank_0(f"Using file store at {file_path}")
|
||||
# store = torch.distributed.FileStore(file_path, torch.distributed.get_world_size())
|
||||
with torch.no_grad():
|
||||
# For all the batches in the dataset.
|
||||
for _, batch in enumerate(dataloader):
|
||||
# Run the model forward.
|
||||
data = process_batch(batch, args)
|
||||
if args.pretrained_bert:
|
||||
tokens, types, labels_, attention_mask = data['text'], data[
|
||||
'types'], data['label'], data['padding_mask']
|
||||
inputs = [tokens, types, attention_mask]
|
||||
elif args.cloze_eval:
|
||||
tokens, labels_, position_ids = data['text'], data[
|
||||
'label'], data['position']
|
||||
attention_mask, target_ids, logit_mask = data['mask'], data[
|
||||
'target'], data['logit_mask']
|
||||
if not args.fast_decode:
|
||||
inputs = [
|
||||
tokens, position_ids, attention_mask, target_ids,
|
||||
logit_mask
|
||||
]
|
||||
if args.continuous_prompt:
|
||||
prompt_pos = data['prompt_pos']
|
||||
inputs.append(prompt_pos)
|
||||
else:
|
||||
dec_input_ids, dec_position_ids, dec_attention_mask = data[
|
||||
'dec_text'], data['dec_position'], data['dec_mask']
|
||||
dec_target_ids, dec_logit_mask = data['dec_target'], data[
|
||||
'dec_logit_mask']
|
||||
inputs = [
|
||||
tokens, position_ids, attention_mask, dec_input_ids,
|
||||
dec_position_ids, dec_attention_mask, dec_target_ids,
|
||||
dec_logit_mask
|
||||
]
|
||||
else:
|
||||
tokens, labels_, position_ids, attention_mask = data[
|
||||
'text'], data['label'], data['position'], data['mask']
|
||||
inputs = [tokens, position_ids, attention_mask]
|
||||
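# Wide multi-choice batches are split into slices of at most `segment_length`
# choices; the per-slice logits are concatenated along the choice dimension.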
if len(inputs[0].shape
|
||||
) == 3 and inputs[0].size(1) > segment_length:
|
||||
logit_list = []
|
||||
for i in range((inputs[0].size(1) - 1) // segment_length + 1):
|
||||
input_batch = [
|
||||
arg[:, i * segment_length:(i + 1) * segment_length]
|
||||
for arg in inputs
|
||||
]
|
||||
if args.pretrained_bert:
|
||||
logits = model(*input_batch)
|
||||
else:
|
||||
logits, *mems = model(*input_batch)
|
||||
logit_list.append(logits)
|
||||
logits = torch.cat(logit_list, dim=1)
|
||||
elif args.cloze_eval and args.fast_decode:
|
||||
logit_list = []
|
||||
num_choices = inputs[3].size(1)
|
||||
for i in range((num_choices - 1) // segment_length + 1):
|
||||
input_batch = inputs[:3] + [
|
||||
arg[:, i * segment_length:(i + 1) * segment_length]
|
||||
for arg in inputs[3:]
|
||||
]
|
||||
logits, *mems = model(*input_batch)
|
||||
logit_list.append(logits)
|
||||
logits = torch.cat(logit_list, dim=1)
|
||||
else:
|
||||
if args.pretrained_bert:
|
||||
logits = model(*inputs)
|
||||
else:
|
||||
logits, *mems = model(*inputs)
|
||||
if 'segment_id' in data:
|
||||
from torch_scatter import scatter_sum
|
||||
if 'loss_mask' in data:
|
||||
logits = logits * data['loss_mask']
|
||||
logits = scatter_sum(logits, data['segment_id'], dim=1)
|
||||
elif 'loss_mask' in data:
|
||||
loss_mask = data['loss_mask']
|
||||
logits = logits * loss_mask - 10000.0 * (1.0 - loss_mask)
|
||||
uid_list = batch['uid']
|
||||
if isinstance(uid_list, torch.Tensor):
|
||||
uid_list = uid_list.cpu().numpy().tolist()
|
||||
predicted = torch.argmax(logits, dim=-1).tolist()
|
||||
labels = labels_.tolist()
|
||||
if args.task.lower() == 'wsc':
|
||||
predicted = [1 if pred == 0 else 0 for pred in predicted]
|
||||
if mpu.get_model_parallel_rank() == 0:
|
||||
for uid, prediction, label in zip(uid_list, predicted, labels):
|
||||
store.set(uid, str((prediction, label)))
|
||||
model.train()
|
||||
torch.distributed.barrier()
|
||||
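# Every rank has written its (prediction, label) pairs to the shared TCPStore
# keyed by uid; rebuild ordered lists over all examples on each rank.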
predictions, labels, examples = [], [], []
|
||||
for uid, example in example_dict.items():
|
||||
prediction, label = eval(store.get(uid))
|
||||
predictions.append(prediction)
|
||||
labels.append(label)
|
||||
examples.append(example)
|
||||
torch.distributed.barrier()
|
||||
return predictions, labels, examples
|
||||
249
modelscope/models/nlp/mglm/tasks/language_model/dataset.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# Copyright (c) 2022 Zhipu.AI
|
||||
|
||||
import math
|
||||
from bisect import bisect_right
|
||||
from itertools import accumulate
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
from tasks.data_utils import build_input_from_ids, num_special_tokens_to_add
|
||||
from tasks.language_model.detokenizer import get_detokenizer
|
||||
from utils import print_rank_0
|
||||
|
||||
|
||||
class LMDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __init__(self, args, documents, tokenizer, num_original_tokens,
|
||||
num_tokenized_tokens):
|
||||
self.args = args
|
||||
self.documents = documents
|
||||
self.max_seq_len = args.seq_length - 1
|
||||
self.tokenizer = tokenizer
|
||||
self.overalapping_eval = args.overlapping_eval
|
||||
if self.overalapping_eval is None:
|
||||
self.overalapping_eval = self.max_seq_len
|
||||
self.overalapping_eval = max(1, self.overalapping_eval)
|
||||
self.num_original_tokens = num_original_tokens
|
||||
self.num_tokenized_tokens = num_tokenized_tokens
|
||||
# remove first sequence tokens
|
||||
targets = [
|
||||
max(len(tokens) - self.max_seq_len, 0) for tokens in self.documents
|
||||
]
|
||||
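# Sliding-window evaluation: each document yields one window every
# `overalapping_eval` tokens. `weights` holds cumulative window counts so that
# __getitem__ can map a flat index to (document, window) with bisect.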
self.num_sequences = [
|
||||
max(math.ceil(target / self.overalapping_eval) + 1, 1)
|
||||
for target in targets
|
||||
]
|
||||
self.weights = list(accumulate(self.num_sequences))
|
||||
self.left_weights = [0] + self.weights[:-1]
|
||||
self.unidirectional = args.unidirectional
|
||||
self.block_lm = args.block_lm
|
||||
mask_token = 'gMASK' if args.task_mask else 'MASK'
|
||||
self.mask_id = self.tokenizer.get_command(mask_token).Id
|
||||
|
||||
def __len__(self):
|
||||
return sum(self.num_sequences)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
document_idx = bisect_right(self.weights, idx)
|
||||
idx = idx - self.left_weights[document_idx]
|
||||
start_idx = idx * self.overalapping_eval
|
||||
end_idx = start_idx + self.max_seq_len
|
||||
tokens = self.documents[document_idx][start_idx:end_idx]
|
||||
if self.block_lm:
|
||||
if idx == 0 or self.unidirectional:
|
||||
prompt, text = tokens[:1], tokens[1:]
|
||||
else:
|
||||
prompt_length = self.max_seq_len - self.overalapping_eval
|
||||
prompt, text = tokens[:prompt_length], tokens[prompt_length:]
|
||||
prompt = prompt + [self.mask_id]
|
||||
num_special_tokens = num_special_tokens_to_add(
|
||||
prompt,
|
||||
None,
|
||||
text,
|
||||
add_cls=True,
|
||||
add_sep=False,
|
||||
add_piece=True,
|
||||
add_eos=False)
|
||||
data = build_input_from_ids(
|
||||
prompt,
|
||||
None,
|
||||
text,
|
||||
self.max_seq_len + num_special_tokens + 1,
|
||||
self.tokenizer,
|
||||
args=self.args,
|
||||
add_cls=True,
|
||||
add_sep=False,
|
||||
add_piece=True,
|
||||
add_eos=False,
|
||||
mask_id=self.mask_id)
|
||||
ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
|
||||
if idx != 0 and self.unidirectional:
|
||||
loss_masks = np.array(loss_masks, dtype=np.int64)
|
||||
loss_masks[:-self.overalapping_eval] = 0
|
||||
return {
|
||||
'text': np.array(ids, dtype=np.int64),
|
||||
'target': np.array(target_ids, dtype=np.int64),
|
||||
'attention_mask': np.array(sep, dtype=np.int64),
|
||||
'loss_mask': np.array(loss_masks, dtype=np.int64),
|
||||
'position_id': np.array(position_ids, dtype=np.int64)
|
||||
}
|
||||
else:
|
||||
loss_masks = [1] * len(tokens)
|
||||
if len(tokens) < self.max_seq_len:
|
||||
tokens = tokens + [0] * (self.max_seq_len - len(tokens))
|
||||
loss_masks = loss_masks + [0] * (
|
||||
self.max_seq_len - len(loss_masks))
|
||||
if idx != 0:
|
||||
loss_masks = np.array(loss_masks, dtype=np.int64)
|
||||
loss_masks[:-self.overalapping_eval] = 0
|
||||
return {
|
||||
'text': np.array(tokens, dtype=np.int64),
|
||||
'loss_mask': np.array(loss_masks, dtype=np.int64)
|
||||
}
|
||||
|
||||
|
||||
class LambadaDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __init__(self, args, tokenizer, strict=True):
|
||||
data_path = args.valid_data[0]
|
||||
print_rank_0(
|
||||
'> building lambada dataset from {} ...'.format(data_path))
|
||||
self.args = args
|
||||
self.max_seq_length = args.seq_length
|
||||
self.tokenizer = tokenizer
|
||||
self.pad_idx = tokenizer.get_command('pad').Id
|
||||
self.strict = strict
|
||||
self.block_lm = args.block_lm
|
||||
self.unidirectional = args.unidirectional
|
||||
mask_token = 'gMASK' if args.task_mask else 'MASK'
|
||||
self.mask_id = self.tokenizer.get_command(mask_token).Id
|
||||
|
||||
self.tokens = []
|
||||
self.labels = []
|
||||
with open(data_path, 'r') as f:
|
||||
for line in f.readlines():
|
||||
text = json.loads(line)['text']
|
||||
tokens, labels = self.get_tokens(text)
|
||||
self.tokens.append(tokens)
|
||||
self.labels.append(labels)
|
||||
|
||||
def get_tokens(self, text):
|
||||
if not self.strict:
|
||||
tokens = self.tokenizer.EncodeAsIds(text).tokenization
|
||||
return tokens[:-1], [tokens[-1]]
|
||||
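# Strict LAMBADA: the target is the final whitespace-delimited word, encoded
# with a leading space so it matches its in-context tokenization.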
last_token = text.split()[-1]
|
||||
start_idx = text.rfind(last_token)
|
||||
beginning_tokens = self.tokenizer.EncodeAsIds(
|
||||
text[:start_idx].strip()).tokenization
|
||||
last_token = self.tokenizer.EncodeAsIds(' ' + last_token).tokenization
|
||||
return beginning_tokens, last_token
|
||||
|
||||
def __len__(self):
|
||||
return len(self.tokens)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
tokens, answer = self.tokens[idx], self.labels[idx]
|
||||
if self.block_lm:
|
||||
if self.unidirectional:
|
||||
tokens, answer_tokens = tokens[:1], tokens[1:] + answer
|
||||
else:
|
||||
answer_tokens = answer
|
||||
tokens = tokens + [self.mask_id]
|
||||
num_special_tokens = num_special_tokens_to_add(
|
||||
tokens,
|
||||
None,
|
||||
answer_tokens,
|
||||
add_cls=True,
|
||||
add_sep=False,
|
||||
add_piece=True)
|
||||
left_shift = len(tokens) + len(
|
||||
answer_tokens) + num_special_tokens - self.max_seq_length
|
||||
if left_shift > 0:
|
||||
tokens = tokens[left_shift:]
|
||||
data = build_input_from_ids(
|
||||
tokens,
|
||||
None,
|
||||
answer_tokens,
|
||||
self.max_seq_length,
|
||||
self.tokenizer,
|
||||
args=self.args,
|
||||
add_cls=True,
|
||||
add_sep=False,
|
||||
add_piece=True,
|
||||
mask_id=self.mask_id)
|
||||
ids, types, paddings, position_ids, sep, target_ids, loss_masks = data
|
||||
if self.unidirectional:
|
||||
loss_masks = np.array(loss_masks, dtype=np.int64)
|
||||
last_index = len(loss_masks)
|
||||
while loss_masks[last_index - 1] == 0:
|
||||
last_index -= 1
|
||||
loss_masks[:last_index - len(answer)] = 0
|
||||
return {
|
||||
'text': np.array(ids, dtype=np.int64),
|
||||
'target': np.array(target_ids, dtype=np.int64),
|
||||
'attention_mask': np.array(sep, dtype=np.int64),
|
||||
'loss_mask': np.array(loss_masks, dtype=np.int64),
|
||||
'position_id': np.array(position_ids, dtype=np.int64)
|
||||
}
|
||||
else:
|
||||
left_shift = len(tokens) - self.max_seq_length
|
||||
if left_shift > 0:
|
||||
tokens = tokens[left_shift:]
|
||||
ids = tokens + answer
|
||||
if len(ids) < self.max_seq_length:
|
||||
ids = ids + [0] * (self.max_seq_length - len(ids))
|
||||
loss_masks = [0] * len(tokens) + [1] * len(answer)
|
||||
if len(loss_masks) < self.max_seq_length:
|
||||
loss_masks = loss_masks + [0] * (
|
||||
self.max_seq_length - len(loss_masks))
|
||||
return {
|
||||
'text': np.array(ids, dtype=np.int64),
|
||||
'loss_mask': np.array(loss_masks, dtype=np.int64)
|
||||
}
|
||||
|
||||
|
||||
def build_lambada_dataset(tokenizer, args):
|
||||
"""Build lambada dataset."""
|
||||
assert len(args.valid_data) == 1
|
||||
val_dataset = LambadaDataset(args, tokenizer, strict=True)
|
||||
print_rank_0(' > found {} samples, {} label tokens.'.format(
|
||||
len(val_dataset), sum(map(len, val_dataset.labels))))
|
||||
return val_dataset
|
||||
|
||||
|
||||
def build_lm_dataset(tokenizer, args):
|
||||
documents = []
|
||||
num_tokens, num_original_tokens = 0, 0
|
||||
with open(args.valid_data[0], encoding='utf-8') as file:
|
||||
for line in file:
|
||||
tokens = tokenizer.EncodeAsIds(line.strip()).tokenization
|
||||
num_tokens += len(tokens)
|
||||
num_original_tokens += len(line.strip().split(' '))
|
||||
documents.append(tokens)
|
||||
val_dataset = LMDataset(args, documents, tokenizer, num_original_tokens,
|
||||
num_tokens)
|
||||
print_rank_0(
|
||||
' > number of documents: {}, number of original tokens: {}, number of detokenized tokens: {}'
|
||||
.format(len(documents), num_original_tokens, num_tokens))
|
||||
return val_dataset
|
||||
|
||||
|
||||
def build_wikitext103_dataset(tokenizer, args):
|
||||
""""""
|
||||
|
||||
assert len(args.valid_data) == 1
|
||||
with open(args.valid_data[0], 'rb') as reader:
|
||||
entire_data = reader.read().decode('utf-8')
|
||||
num_original_tokens = len(entire_data.strip().split(' '))
|
||||
entire_data = get_detokenizer('wikitext')(entire_data)
|
||||
print_rank_0(entire_data[:1024])
|
||||
tokenized_data = tokenizer.EncodeAsIds(entire_data).tokenization
|
||||
num_tokenized_tokens = len(tokenized_data)
|
||||
|
||||
val_dataset = LMDataset(args, [tokenized_data], tokenizer,
|
||||
num_original_tokens, num_tokenized_tokens)
|
||||
print_rank_0(' > number of original tokens: {}, number of detokenized '
|
||||
'tokens: {}'.format(num_original_tokens,
|
||||
num_tokenized_tokens))
|
||||
return val_dataset
|
||||
63
modelscope/models/nlp/mglm/tasks/language_model/detokenizer.py
Executable file
@@ -0,0 +1,63 @@
|
||||
# Copyright (c) 2022 Zhipu.AI
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def ptb_detokenizer(string):
|
||||
string = string.replace(" '", "'")
|
||||
string = string.replace(' \n', '\n')
|
||||
string = string.replace('\n ', '\n')
|
||||
string = string.replace(" n't", "n't")
|
||||
string = string.replace(' N ', '1 ')
|
||||
string = string.replace('$ 1', '$1')
|
||||
string = string.replace('# 1', '#1')
|
||||
return string
|
||||
|
||||
|
||||
def wikitext_detokenizer(string):
|
||||
# contractions
|
||||
string = string.replace("s '", "s'")
|
||||
string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
|
||||
# number separators
|
||||
string = string.replace(' @-@ ', '-')
|
||||
string = string.replace(' @,@ ', ',')
|
||||
string = string.replace(' @.@ ', '.')
|
||||
# punctuation
|
||||
string = string.replace(' : ', ': ')
|
||||
string = string.replace(' ; ', '; ')
|
||||
string = string.replace(' . ', '. ')
|
||||
string = string.replace(' ! ', '! ')
|
||||
string = string.replace(' ? ', '? ')
|
||||
string = string.replace(' , ', ', ')
|
||||
# double brackets
|
||||
string = re.sub(r'\(\s*([^\)]*?)\s*\)', r'(\1)', string)
|
||||
string = re.sub(r'\[\s*([^\]]*?)\s*\]', r'[\1]', string)
|
||||
string = re.sub(r'{\s*([^}]*?)\s*}', r'{\1}', string)
|
||||
string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
|
||||
string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
|
||||
# miscellaneous
|
||||
string = string.replace('= = = =', '====')
|
||||
string = string.replace('= = =', '===')
|
||||
string = string.replace('= =', '==')
|
||||
string = string.replace(' ' + chr(176) + ' ', chr(176))
|
||||
string = string.replace(' \n', '\n')
|
||||
string = string.replace('\n ', '\n')
|
||||
string = string.replace(' N ', ' 1 ')
|
||||
string = string.replace(" 's", "'s")
|
||||
|
||||
return string
|
||||
|
||||
|
||||
def lambada_detokenizer(string):
|
||||
return string
|
||||
|
||||
|
||||
def get_detokenizer(dataset):
|
||||
return DETOKENIZERS[dataset]
|
||||
|
||||
|
||||
DETOKENIZERS = {
|
||||
'ptb': ptb_detokenizer,
|
||||
'wikitext': wikitext_detokenizer,
|
||||
'lambada': lambada_detokenizer,
|
||||
}
|
||||
254
modelscope/models/nlp/mglm/tasks/language_model/finetune.py
Normal file
@@ -0,0 +1,254 @@
|
||||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""GPT2 zero-shot evaluation."""
|
||||
|
||||
import functools
|
||||
import math
|
||||
|
||||
import mpu
|
||||
import torch
|
||||
from finetune_glm import finetune
|
||||
from pretrain_glm import get_batch
|
||||
from tasks.data_utils import build_data_loader
|
||||
from tasks.language_model.dataset import (build_lambada_dataset,
|
||||
build_lm_dataset,
|
||||
build_wikitext103_dataset)
|
||||
from utils import print_rank_0
|
||||
|
||||
global_tokenizer = None
|
||||
|
||||
|
||||
def lm_forward_step(data, model, args, timers, mems, eval_metric=None):
|
||||
"""Forward step."""
|
||||
|
||||
# Get the batch.
|
||||
if timers is not None:
|
||||
timers('batch generator').start()
|
||||
if 'mask' in data:
|
||||
data['attention_mask'] = data.pop('mask')
|
||||
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
|
||||
data, args)
|
||||
if timers is not None:
|
||||
timers('batch generator').stop()
|
||||
|
||||
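# Decode and print one sample for inspection: [MASK] slots are shown with
# their positions, and each generated piece is printed next to its target.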
def print_masked_text(batch_id):
|
||||
block_position_ids = position_ids[:, 1]
|
||||
position_ids_ = position_ids[:, 0]
|
||||
output_tokens = []
|
||||
sep = attention_mask[batch_id].item()
|
||||
for i, token in enumerate(tokens[batch_id, :sep].tolist()):
|
||||
if global_tokenizer is not None:
|
||||
token = global_tokenizer.IdToToken(token)
|
||||
if token.startswith('[MASK'):
|
||||
token = f'[{position_ids_[batch_id, i].item()}, {token}]'
|
||||
if token.startswith('##') and len(
|
||||
output_tokens) > 0 and not output_tokens[-1].endswith(
|
||||
']'):
|
||||
output_tokens[-1] += token[2:]
|
||||
else:
|
||||
output_tokens.append(token)
|
||||
else:
|
||||
output_tokens.append(str(token))
|
||||
print(' '.join(output_tokens))
|
||||
last_index = None
|
||||
for i in range(sep, tokens.size(1)):
|
||||
if global_tokenizer.IdToToken(
|
||||
tokens[batch_id, i].item()).startswith('<|startofpiece'):
|
||||
if last_index is not None:
|
||||
print(
|
||||
global_tokenizer.DecodeIds(
|
||||
tokens[batch_id, last_index:i].tolist()), '|',
|
||||
global_tokenizer.DecodeIds(
|
||||
labels[batch_id, last_index:i].tolist())),
|
||||
print(position_ids_[batch_id, last_index:i].tolist(),
|
||||
block_position_ids[batch_id, last_index:i].tolist())
|
||||
last_index = i
|
||||
if last_index is not None:
|
||||
print(
|
||||
global_tokenizer.DecodeIds(tokens[batch_id,
|
||||
last_index:].tolist()), '|',
|
||||
global_tokenizer.DecodeIds(labels[batch_id,
|
||||
last_index:].tolist()))
|
||||
print(position_ids_[batch_id, last_index:].tolist(),
|
||||
block_position_ids[batch_id, last_index:].tolist())
|
||||
|
||||
# Forward model.
|
||||
if args.continuous_prompt:
|
||||
prompt_pos = data['prompt_pos'].long().cuda()
|
||||
logits, *mems = model(
|
||||
tokens, position_ids, attention_mask, *mems, prompt_pos=prompt_pos)
|
||||
else:
|
||||
logits, *mems = model(tokens, position_ids, attention_mask, *mems)
|
||||
|
||||
if eval_metric is None or eval_metric == 'loss':
|
||||
losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(),
|
||||
labels)
|
||||
loss_mask = loss_mask.view(-1)
|
||||
# The loss is not normalized for fair comparison
|
||||
loss = torch.sum(losses.view(-1) * loss_mask)
|
||||
if eval_metric is None:
|
||||
loss = loss / loss_mask.sum()
|
||||
return loss, mems, 'bert'
|
||||
elif eval_metric == 'accuracy' or eval_metric == 'classify':
|
||||
logits = mpu.gather_from_model_parallel_region(logits)
|
||||
outputs = torch.argmax(logits, -1)
|
||||
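# A sample is correct only if every position covered by the loss mask matches;
# positions outside the mask are forced to 1 before the product over the sequence.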
correct = (outputs == labels).float()
|
||||
correct[(1 - loss_mask).bool()] = 1
|
||||
correct = correct.prod(-1)
|
||||
if eval_metric == 'accuracy':
|
||||
correct = correct.sum()
|
||||
return correct, mems, 'bert'
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
'Metric {} not implemented'.format(eval_metric))
|
||||
|
||||
|
||||
def classify_evaluate(model, dataloader, example_dict, args):
|
||||
"""Evaluation."""
|
||||
# Turn on evaluation mode which disables dropout.
|
||||
model.eval()
|
||||
predictions, labels, examples = [], [], []
|
||||
with torch.no_grad():
|
||||
# For all the batches in the dataset.
|
||||
for iteration, batch in enumerate(dataloader):
|
||||
# Forward evaluation.
|
||||
output, _, _ = lm_forward_step(
|
||||
batch, model, args, None, [], eval_metric='classify')
|
||||
uid_list = batch['uid']
|
||||
example_batch = [example_dict[uid] for uid in uid_list]
|
||||
predictions.extend(output.long().tolist())
|
||||
label = batch['label'].tolist()
|
||||
labels.extend(label)
|
||||
examples.extend(example_batch)
|
||||
return predictions, labels, examples
|
||||
|
||||
|
||||
def evaluate(model, dataloader, eval_metric, args):
|
||||
"""Evaluation."""
|
||||
# Turn on evaluation mode which disables dropout.
|
||||
model.eval()
|
||||
total_output, total_count = 0.0, 0
|
||||
total_tokens = 0
|
||||
with torch.no_grad():
|
||||
# For all the batches in the dataset.
|
||||
for iteration, batch in enumerate(dataloader):
|
||||
if (iteration + 1) % args.log_interval == 0:
|
||||
print_rank_0('> working on iteration: {}'.format(iteration))
|
||||
# Forward evaluation.
|
||||
output, _, _ = lm_forward_step(
|
||||
batch, model, args, None, [], eval_metric=eval_metric)
|
||||
count = batch['text'].size(0)
|
||||
count = torch.cuda.LongTensor([count])
|
||||
# Reduce across processes.
|
||||
torch.distributed.all_reduce(
|
||||
output, group=mpu.get_data_parallel_group())
|
||||
torch.distributed.all_reduce(
|
||||
count, group=mpu.get_data_parallel_group())
|
||||
|
||||
total_output += output.item()
|
||||
total_count += count.item()
|
||||
total_tokens += batch['loss_mask'].sum().item()
|
||||
totals = torch.cuda.FloatTensor([total_output, total_tokens])
|
||||
torch.distributed.all_reduce(totals, group=mpu.get_data_parallel_group())
|
||||
total_output, total_tokens = totals.tolist()
|
||||
print(total_tokens)
|
||||
return {eval_metric: total_output}, total_count
|
||||
|
||||
|
||||
def evaluate_and_print_results(data_loader, model, eval_metric, args):
|
||||
"""Evaluate and print results on screen."""
|
||||
|
||||
# Evaluate and get results.
|
||||
output, _ = evaluate(model, data_loader, eval_metric, args)
|
||||
|
||||
string = ''
|
||||
if eval_metric == 'loss':
|
||||
output = output['loss']
|
||||
num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens
|
||||
num_original_tokens = data_loader.dataset.num_original_tokens
|
||||
val_loss = output / (num_tokenized_tokens - 1)
|
||||
ppl = math.exp(min(20, val_loss))
|
||||
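# Adjusted perplexity rescales the loss by the ratio of tokenized to original
# whitespace tokens, making scores comparable across tokenizers.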
token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
|
||||
adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
|
||||
string += 'avg loss: {:.4E} | '.format(val_loss)
|
||||
string += 'ppl: {:.4E} | '.format(ppl)
|
||||
string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
|
||||
string += 'token ratio: {} |'.format(token_ratio)
|
||||
score_dict = {
|
||||
'avg loss': val_loss,
|
||||
'ppl': ppl,
|
||||
'adjusted ppl': adjusted_ppl
|
||||
}
|
||||
|
||||
elif eval_metric == 'accuracy':
|
||||
output = output['accuracy']
|
||||
num_examples = len(data_loader.dataset)
|
||||
acc = output / num_examples * 100
|
||||
string += 'number correct: {} | '.format(output)
|
||||
string += 'total examples: {} | '.format(num_examples)
|
||||
string += 'avg accuracy: {:.2f}'.format(acc)
|
||||
score_dict = {'accuracy': acc}
|
||||
else:
|
||||
raise NotImplementedError('evaluation method for {} metric is not '
|
||||
'implemented yet.'.format(eval_metric))
|
||||
|
||||
length = len(string) + 1
|
||||
print_rank_0('-' * length)
|
||||
print_rank_0(string)
|
||||
print_rank_0('-' * length)
|
||||
return score_dict
|
||||
|
||||
|
||||
def metrics_func_provider(args, tokenizer, is_test):
|
||||
"""Privde metrics callback function."""
|
||||
|
||||
if args.task.lower() == 'lambda':
|
||||
eval_metric = 'accuracy'
|
||||
dataset = build_lambada_dataset(tokenizer, args)
|
||||
elif args.task == 'wikitext':
|
||||
eval_metric = 'loss'
|
||||
dataset = build_wikitext103_dataset(tokenizer, args)
|
||||
elif args.task == 'language_model':
|
||||
eval_metric = 'loss'
|
||||
dataset = build_lm_dataset(tokenizer, args)
|
||||
else:
|
||||
raise NotImplementedError('{} task is not implemented.'.format(
|
||||
args.task))
|
||||
# Data stuff
|
||||
dataloader = build_data_loader(
|
||||
dataset,
|
||||
args.eval_batch_size,
|
||||
args.num_workers,
|
||||
drop_last=False,
|
||||
shuffle=False)
|
||||
|
||||
def metrics_func(model,
|
||||
epoch,
|
||||
output_predictions=False,
|
||||
summary_writer=None):
|
||||
return evaluate_and_print_results(
|
||||
dataloader, model, eval_metric=eval_metric, args=args)
|
||||
|
||||
global global_tokenizer
|
||||
global_tokenizer = tokenizer
|
||||
return metrics_func
|
||||
|
||||
|
||||
def main(args):
|
||||
"""Main program."""
|
||||
finetune(
|
||||
args,
|
||||
None, {},
|
||||
end_of_epoch_callback_provider=metrics_func_provider,
|
||||
forward_step=lm_forward_step)
|
||||
667
modelscope/models/nlp/mglm/tasks/seq2seq/dataset.py
Normal file
@@ -0,0 +1,667 @@
|
||||
# Copyright (c) 2022 Zhipu.AI
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.data
|
||||
from data_utils.corpora import punctuation_standardization
|
||||
from tasks.data_utils import InputExample
|
||||
from tqdm import tqdm
|
||||
from utils import print_rank_0
|
||||
|
||||
|
||||
def gigaword_detokenize(string, is_target=False):
|
||||
_tok_dict = {
|
||||
'(': '-lrb-',
|
||||
')': '-rrb-',
|
||||
'[': '-lsb-',
|
||||
']': '-rsb-',
|
||||
'{': '-lcb-',
|
||||
'}': '-rcb-',
|
||||
'&': '&amp;',
'<': '&lt;',
'>': '&gt;'
|
||||
}
|
||||
string = string.replace('UNK', '[UNK]')
|
||||
string = string.replace('<unk>', '[UNK]')
|
||||
for key, value in _tok_dict.items():
|
||||
string = string.replace(value, key)
|
||||
# string = string.replace("''", "\"")
|
||||
# string = string.replace("``", "\"")
|
||||
# string = string.replace("`", "'")
|
||||
# string = string.replace(" n't", "n't")
|
||||
# string = string.replace(" 's", "'s")
|
||||
# string = string.replace(" 'd", "'d")
|
||||
# string = string.replace(" 'll", "'ll")
|
||||
return string
|
||||
|
||||
|
||||
def cnndm_detokenize(string, is_target=False):
|
||||
_tok_dict = {
|
||||
'(': '-LRB-',
|
||||
')': '-RRB-',
|
||||
'[': '-LSB-',
|
||||
']': '-RSB-',
|
||||
'{': '-LCB-',
|
||||
'}': '-RCB-'
|
||||
}
|
||||
if not is_target:
|
||||
string = string.replace('<S_SEP>', '')
|
||||
else:
|
||||
string = string.replace('<S_SEP>', '[SEP]')
|
||||
for key, value in _tok_dict.items():
|
||||
string = string.replace(value, key)
|
||||
string = string.replace("''", "\"")
|
||||
string = string.replace('``', "\"")
|
||||
string = string.replace('`', "'")
|
||||
string = string.replace(" n't", "n't")
|
||||
string = string.replace(" 's", "'s")
|
||||
string = string.replace(" 'd", "'d")
|
||||
string = string.replace(" 'll", "'ll")
|
||||
return string
|
||||
|
||||
|
||||
def blanklm_detokenize(string, is_target=False):
|
||||
string = string.replace('_UNK', '[UNK]')
|
||||
string = string.replace('<blank>', '[MASK]')
|
||||
return string
|
||||
|
||||
|
||||
class SummmaryProcessor:
|
||||
|
||||
def __init__(self, task, data_dir, tokenizer):
|
||||
self.task = task
|
||||
self.data_dir = data_dir
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
def create_examples(self, split):
|
||||
if split == 'train':
|
||||
filename = 'train'
|
||||
elif split == 'dev':
|
||||
filename = 'val'
|
||||
elif split == 'test':
|
||||
filename = 'test'
|
||||
else:
|
||||
raise NotImplementedError(split)
|
||||
print_rank_0(
|
||||
f'Creating {self.task}-{split} dataset from {self.data_dir}')
|
||||
if self.task == 'gigaword':
|
||||
detokenizer = gigaword_detokenize
|
||||
elif self.task == 'cnn_dm':
|
||||
detokenizer = cnndm_detokenize
|
||||
else:
|
||||
detokenizer = None
|
||||
source_texts, target_texts = [], []
|
||||
with open(
|
||||
os.path.join(self.data_dir, f'{filename}.source'),
|
||||
encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
line = punctuation_standardization(line)
|
||||
line = detokenizer(line) if detokenizer else line
|
||||
source_texts.append(line)
|
||||
with open(
|
||||
os.path.join(self.data_dir, f'{filename}.target'),
|
||||
encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
line = punctuation_standardization(line)
|
||||
line = detokenizer(
|
||||
line, is_target=True) if detokenizer else line
|
||||
target_texts.append(line)
|
||||
assert len(source_texts) == len(target_texts)
|
||||
example_list = []
|
||||
for idx, (source_text,
|
||||
target_text) in enumerate(zip(source_texts, target_texts)):
|
||||
if (idx + 1) % 20000 == 0:
|
||||
print_rank_0(f'Complete {idx + 1} examples')
|
||||
guid = '%s-%s' % (split, idx)
|
||||
meta = {
|
||||
'ref':
|
||||
self.tokenizer.DecodeIds(
|
||||
self.tokenizer.EncodeAsIds(target_text).tokenization)
|
||||
}
|
||||
example = InputExample(
|
||||
guid=guid, text_a=source_text, text_b=target_text, meta=meta)
|
||||
if idx < 10:
|
||||
print_rank_0(
|
||||
(source_text.encode('utf-8'), target_text.encode('utf-8'),
|
||||
meta['ref'].encode('utf-8')))
|
||||
example_list.append(example)
|
||||
return example_list
|
||||
|
||||
|
||||
class SQuADProcessor:
|
||||
|
||||
def __init__(self, data_dir, tokenizer):
|
||||
self.data_dir = data_dir
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
def create_examples(self, split):
|
||||
if split == 'train':
|
||||
filename = 'train.json'
|
||||
elif split == 'dev':
|
||||
filename = 'dev.json'
|
||||
elif split == 'test':
|
||||
filename = 'test.json'
|
||||
else:
|
||||
raise NotImplementedError(split)
|
||||
print_rank_0(f'Creating SQuAD-{split} dataset from {self.data_dir}')
|
||||
example_list = []
|
||||
idx = 0
|
||||
with open(
|
||||
os.path.join(self.data_dir, filename),
|
||||
encoding='utf-8') as file:
|
||||
dataset = json.load(file)
|
||||
for paragraphs in dataset:
|
||||
for paragraph in paragraphs['paragraphs']:
|
||||
context = paragraph['context']
|
||||
for qa in paragraph['qas']:
|
||||
question = qa['question']
|
||||
answers = {answer['text'] for answer in qa['answers']}
|
||||
answer_starts = {
|
||||
answer['text']: answer['answer_start']
|
||||
for answer in qa['answers']
|
||||
}
|
||||
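# Emit one InputExample per distinct answer: the passage is text_a, while the
# question to generate, the answer and its start offset live in meta.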
for answer in answers:
|
||||
guid = '%s-%s' % (split, idx)
|
||||
meta = {
|
||||
'answer_start':
|
||||
answer_starts[answer],
|
||||
'answer':
|
||||
answer,
|
||||
'question':
|
||||
question,
|
||||
'ref':
|
||||
self.tokenizer.DecodeIds(
|
||||
self.tokenizer.EncodeAsIds(
|
||||
question).tokenization)
|
||||
}
|
||||
example = InputExample(
|
||||
guid=guid, text_a=context, meta=meta)
|
||||
if idx < 10:
|
||||
print_rank_0((context.encode('utf-8'),
|
||||
answer.encode('utf-8'),
|
||||
meta['ref'].encode('utf-8')))
|
||||
example_list.append(example)
|
||||
idx += 1
|
||||
print_rank_0(f'Creating {len(example_list)} examples for {split}')
|
||||
return example_list
|
||||
|
||||
|
||||
class XSumProcessor:
|
||||
|
||||
def __init__(self, data_dir, tokenizer):
|
||||
self.data_dir = data_dir
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
def create_examples(self, split):
|
||||
if split == 'train':
|
||||
key = 'train'
|
||||
elif split == 'dev':
|
||||
key = 'validation'
|
||||
elif split == 'test':
|
||||
key = 'test'
|
||||
else:
|
||||
raise NotImplementedError(split)
|
||||
print_rank_0(f'Creating XSUM-{split} dataset from {self.data_dir}')
|
||||
with open(
|
||||
os.path.join(
|
||||
self.data_dir,
|
||||
'XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json')) as file:
|
||||
id_list = json.load(file)
|
||||
id_list = id_list[key]
|
||||
source_texts, target_texts = [], []
|
||||
for i, idx in enumerate(id_list):
|
||||
with open(os.path.join(self.data_dir, f'{idx}.summary')) as file:
|
||||
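# Each BBC .summary file is split into [SN]-delimited sections: RESTBODY becomes
# the source document and FIRST-SENTENCE the target summary.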
key, sentences = None, []
|
||||
source_text, target_text = None, None
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
if line.startswith('[SN]'):
|
||||
if key is not None:
|
||||
if key == 'RESTBODY':
|
||||
source_text = ' '.join(sentences)
|
||||
elif key == 'FIRST-SENTENCE':
|
||||
target_text = ' '.join(sentences)
|
||||
key = line[4:-4]
|
||||
sentences = []
|
||||
elif line:
|
||||
sentences.append(line)
|
||||
if key is not None:
|
||||
if key == 'RESTBODY':
|
||||
source_text = ' '.join(sentences)
|
||||
elif key == 'FIRST-SENTENCE':
|
||||
target_text = ' '.join(sentences)
|
||||
source_texts.append(source_text)
|
||||
target_texts.append(target_text)
|
||||
if (i + 1) % 1000 == 0:
|
||||
print_rank_0(f'Complete {i + 1} examples')
|
||||
assert len(source_texts) == len(target_texts)
|
||||
example_list = []
|
||||
for idx, (source_text,
|
||||
target_text) in enumerate(zip(source_texts, target_texts)):
|
||||
if (idx + 1) % 20000 == 0:
|
||||
print_rank_0(f'Complete {idx + 1} examples')
|
||||
guid = '%s-%s' % (split, idx)
|
||||
meta = {
|
||||
'ref':
|
||||
self.tokenizer.DecodeIds(
|
||||
self.tokenizer.EncodeAsIds(target_text).tokenization)
|
||||
}
|
||||
example = InputExample(
|
||||
guid=guid, text_a=source_text, text_b=target_text, meta=meta)
|
||||
if idx < 10:
|
||||
print_rank_0(
|
||||
(source_text.encode('utf-8'), target_text.encode('utf-8'),
|
||||
meta['ref'].encode('utf-8')))
|
||||
example_list.append(example)
|
||||
return example_list
|
||||
|
||||
|
||||
class Seq2SeqDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __init__(self, args, split, tokenizer):
|
||||
self.args = args
|
||||
self.task, self.data_dir = args.task.lower(), args.data_dir
|
||||
self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length
|
||||
self.split = split
|
||||
self.tokenizer = tokenizer
|
||||
self.dataset_name = split
|
||||
if self.task in ['gigaword', 'cnn_dm', 'cnn_dm_original']:
|
||||
self.processor = SummmaryProcessor(self.task, self.data_dir,
|
||||
tokenizer)
|
||||
elif self.task in ['xsum']:
|
||||
self.processor = XSumProcessor(self.data_dir, tokenizer)
|
||||
elif self.task in ['squad_generation']:
|
||||
self.processor = SQuADProcessor(self.data_dir, tokenizer)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
example_list = self.processor.create_examples(split)
|
||||
self.example_list = example_list
|
||||
self.examples = {example.guid: example for example in example_list}
|
||||
|
||||
print_rank_0(f'Return {len(self.examples)} {split} examples')
|
||||
|
||||
def __len__(self):
|
||||
return len(self.example_list)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
example = self.example_list[idx]
|
||||
cls_id = self.tokenizer.get_command('ENC').Id
|
||||
mask_token = 'sMASK' if self.args.task_mask else 'MASK'
|
||||
mask_id = self.tokenizer.get_command(mask_token).Id
|
||||
pad_id = self.tokenizer.get_command('pad').Id
|
||||
sop_id = self.tokenizer.get_command('sop').Id
|
||||
eop_id = self.tokenizer.get_command('eop').Id
|
||||
if self.task in ['gigaword', 'cnn_dm', 'cnn_dm_original', 'xsum']:
|
||||
source_text, target_text = example.text_a, example.text_b
|
||||
source_tokens = self.tokenizer.EncodeAsIds(
|
||||
' ' + source_text).tokenization
|
||||
prompt = [cls_id, mask_id
|
||||
] + self.tokenizer.EncodeAsIds(' Content:').tokenization
|
||||
if len(source_tokens) > self.max_src_length - len(prompt):
|
||||
source_tokens = source_tokens[:self.max_src_length
|
||||
- len(prompt)]
|
||||
source_tokens = prompt + source_tokens
|
||||
elif self.task == 'squad_generation':
|
||||
source_text = example.text_a
|
||||
target_text, answer = example.meta['question'], example.meta[
|
||||
'answer']
|
||||
source_tokens = self.tokenizer.EncodeAsIds(
|
||||
source_text.rstrip() + ' Question:').tokenization
|
||||
answer_tokens = self.tokenizer.EncodeAsIds(' Answer: '
|
||||
+ answer).tokenization
|
||||
if len(source_tokens
|
||||
) > self.max_src_length - len(answer_tokens) - 2:
|
||||
max_src_length = self.max_src_length - len(answer_tokens) - 2
|
||||
answer_pattern = self.tokenizer.EncodeAsIds(
|
||||
' ' + answer).tokenization
|
||||
|
||||
def sub_finder(mylist, pattern):
|
||||
matches = []
|
||||
for i in range(len(mylist)):
|
||||
if mylist[i] == pattern[0] and mylist[
|
||||
i:i + len(pattern)] == pattern:
|
||||
matches.append(i)
|
||||
return matches
|
||||
|
||||
answer_indices = sub_finder(source_tokens, answer_pattern)
|
||||
if len(answer_indices) == 0:
|
||||
print(f'Answer {answer} does not exist in the source text')
|
||||
source_tokens = source_tokens[:max_src_length]
|
||||
else:
|
||||
start_index = max(answer_indices[0] - max_src_length // 2,
|
||||
0)
|
||||
source_tokens = source_tokens[start_index:start_index
|
||||
+ max_src_length]
|
||||
source_tokens = [cls_id] + source_tokens + [mask_id
|
||||
] + answer_tokens
|
||||
else:
|
||||
raise NotImplementedError
|
||||
if len(source_tokens) < self.max_src_length:
|
||||
source_tokens = source_tokens + [pad_id] * (
|
||||
self.max_src_length - len(source_tokens))
|
||||
sep = len(source_tokens)
|
||||
position_ids = list(range(len(source_tokens)))
|
||||
block_position_ids = [0] * len(source_tokens)
|
||||
mask_pos = source_tokens.index(mask_id)
|
||||
if self.split == 'train':
|
||||
target_tokens = self.tokenizer.EncodeAsIds(
|
||||
' ' + target_text).tokenization
|
||||
target_tokens = target_tokens + [eop_id]
|
||||
if len(target_tokens) > self.max_tgt_length:
|
||||
target_tokens = target_tokens[:self.max_tgt_length]
|
||||
loss_mask = [1] * len(target_tokens)
|
||||
if len(target_tokens) < self.max_tgt_length:
|
||||
loss_mask += [0] * (self.max_tgt_length - len(target_tokens))
|
||||
target_tokens += [pad_id] * (
|
||||
self.max_tgt_length - len(target_tokens))
|
||||
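# GLM-style blank filling: the decoder input is [sop] + target shifted right;
# target positions reuse the [MASK] position in the first position-id channel,
# and the second channel counts positions inside the generated block.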
tokens = source_tokens + [sop_id] + target_tokens[:-1]
|
||||
loss_mask = [0] * len(source_tokens) + loss_mask
|
||||
target_ids = [0] * len(source_tokens) + target_tokens
|
||||
position_ids += [mask_pos] * len(target_tokens)
|
||||
if self.args.no_block_position:
|
||||
block_position_ids += [1] * len(target_tokens)
|
||||
else:
|
||||
block_position_ids += list(range(1, len(target_tokens) + 1))
|
||||
position_ids = [position_ids, block_position_ids]
|
||||
sample = {
|
||||
'text': np.array(tokens, dtype=np.int64),
|
||||
'target': np.array(target_ids, dtype=np.int64),
|
||||
'attention_mask': np.array(sep, dtype=np.int64),
|
||||
'loss_mask': np.array(loss_mask, dtype=np.int64),
|
||||
'position_id': np.array(position_ids, dtype=np.int64),
|
||||
'uid': example.guid
|
||||
}
|
||||
else:
|
||||
tokens = source_tokens + [sop_id]
|
||||
position_ids = position_ids + [mask_pos]
|
||||
block_position_ids = block_position_ids + [1]
|
||||
position_ids = [position_ids, block_position_ids]
|
||||
sample = {
|
||||
'text': np.array(tokens, dtype=np.int64),
|
||||
'attention_mask': np.array(sep, dtype=np.int64),
|
||||
'position_id': np.array(position_ids, dtype=np.int64),
|
||||
'uid': example.guid
|
||||
}
|
||||
return sample
|
||||
|
||||
|
||||
class ExtractionDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __init__(self, args, split, tokenizer):
|
||||
self.args = args
|
||||
task, data_dir = args.task.lower(), args.data_dir
|
||||
self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length
|
||||
self.split = split
|
||||
self.tokenizer = tokenizer
|
||||
if split == 'train':
|
||||
filename = 'train'
|
||||
elif split == 'dev':
|
||||
filename = 'valid'
|
||||
elif split == 'test':
|
||||
filename = 'test'
|
||||
else:
|
||||
raise NotImplementedError(split)
|
||||
print_rank_0(f'Creating {task}-{split} dataset from {data_dir}')
|
||||
self.dataset_name = split
|
||||
source_texts, target_texts = [], []
|
||||
with open(
|
||||
os.path.join(data_dir, f'{filename}.source'),
|
||||
encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
source_texts.append(line)
|
||||
with open(
|
||||
os.path.join(data_dir, f'{filename}.target'),
|
||||
encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
target_texts.append(line)
|
||||
self.examples, self.example_list = {}, []
|
||||
for idx, (source_text,
|
||||
target_text) in enumerate(zip(source_texts, target_texts)):
|
||||
if (idx + 1) % 20000 == 0:
|
||||
print_rank_0(f'Complete {idx + 1} examples')
|
||||
guid = '%s-%s' % (split, idx)
|
||||
meta = {'ref': target_text}
|
||||
example = InputExample(
|
||||
guid=guid, text_a=source_text, text_b=target_text, meta=meta)
|
||||
self.examples[guid] = example
|
||||
self.example_list.append(example)
|
||||
print_rank_0(f'Return {len(self.examples)} {split} examples')
|
||||
|
||||
def __len__(self):
|
||||
return len(self.example_list)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
example = self.example_list[idx]
|
||||
source_text, target_text = example.text_a, example.text_b
|
||||
mask_token = 'MASK'
|
||||
mask_id = self.tokenizer.get_command(mask_token).Id
|
||||
sop_id = self.tokenizer.get_command('sop').Id
|
||||
eop_id = self.tokenizer.get_command('eop').Id
|
||||
pad_id = self.tokenizer.get_command('pad').Id
|
||||
|
||||
def pad_to(text, max_len, pad_id):
|
||||
if len(text) > max_len:
|
||||
text = text[:max_len]
|
||||
else:
|
||||
text = text + [pad_id] * (max_len - len(text))
|
||||
return text
|
||||
|
||||
source_tokens = self.tokenizer.EncodeAsIds(source_text).tokenization
|
||||
masked_tgt = target_text.split('|')
|
||||
source_tokens = pad_to(source_tokens, self.max_src_length, pad_id)
|
||||
sep = len(source_tokens)
|
||||
position_ids = list(range(len(source_tokens)))
|
||||
block_position_ids = [0] * len(source_tokens)
|
||||
if self.split == 'train':
|
||||
mask_positions = [
|
||||
i for i, x in enumerate(source_tokens) if x == mask_id
|
||||
]
|
||||
assert len(mask_positions) <= len(masked_tgt)
|
||||
tokens = source_tokens
|
||||
target_ids = [0] * len(source_tokens)
|
||||
loss_mask = [0] * len(source_tokens)
|
||||
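# For every [MASK] in the source, append [sop] + the corresponding target span;
# its targets end with [eop], and position ids reuse the mask position.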
for i, mask_pos in enumerate(mask_positions):
|
||||
tgt_text = masked_tgt[i]
|
||||
tgt_tokens = self.tokenizer.EncodeAsIds(
|
||||
' ' + tgt_text).tokenization
|
||||
tokens += [sop_id] + tgt_tokens
|
||||
target_ids += tgt_tokens + [eop_id]
|
||||
loss_mask += [1] * (len(tgt_tokens) + 1)
|
||||
position_ids += [mask_pos] * (len(tgt_tokens) + 1)
|
||||
block_position_ids += [
|
||||
i + 1 for i in range(len(tgt_tokens) + 1)
|
||||
]
|
||||
tokens = pad_to(tokens, self.max_src_length + self.max_tgt_length,
|
||||
pad_id)
|
||||
target_ids = pad_to(target_ids,
|
||||
self.max_src_length + self.max_tgt_length,
|
||||
pad_id)
|
||||
loss_mask = pad_to(loss_mask,
|
||||
self.max_src_length + self.max_tgt_length, 0)
|
||||
position_ids = pad_to(position_ids,
|
||||
self.max_src_length + self.max_tgt_length, 0)
|
||||
block_position_ids = pad_to(
|
||||
block_position_ids, self.max_src_length + self.max_tgt_length,
|
||||
0)
|
||||
position_ids = [position_ids, block_position_ids]
|
||||
sample = {
|
||||
'text': np.array(tokens, dtype=np.int64),
|
||||
'target': np.array(target_ids, dtype=np.int64),
|
||||
'attention_mask': np.array(sep, dtype=np.int64),
|
||||
'loss_mask': np.array(loss_mask, dtype=np.int64),
|
||||
'position_id': np.array(position_ids, dtype=np.int64),
|
||||
'uid': example.guid
|
||||
}
|
||||
else:
|
||||
tokens = source_tokens + [sop_id]
|
||||
mask_pos = source_tokens.index(mask_id)
|
||||
position_ids = position_ids + [mask_pos]
|
||||
block_position_ids = block_position_ids + [1]
|
||||
position_ids = [position_ids, block_position_ids]
|
||||
sample = {
|
||||
'text': np.array(tokens, dtype=np.int64),
|
||||
'attention_mask': np.array(sep, dtype=np.int64),
|
||||
'position_id': np.array(position_ids, dtype=np.int64),
|
||||
'uid': example.guid
|
||||
}
|
||||
return sample
|
||||
|
||||
|
||||
class BlankLMDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __init__(self, args, split, tokenizer):
|
||||
self.args = args
|
||||
task, data_dir = args.task.lower(), args.data_dir
|
||||
self.max_src_length, self.max_tgt_length = args.src_seq_length, args.tgt_seq_length
|
||||
self.split = split
|
||||
assert args.tokenizer_type == 'BertWordPieceTokenizer'
|
||||
self.tokenizer = tokenizer
|
||||
if split == 'train':
|
||||
filename = 'train'
|
||||
elif split == 'dev':
|
||||
filename = 'valid'
|
||||
elif split == 'test':
|
||||
filename = 'test'
|
||||
else:
|
||||
raise NotImplementedError(split)
|
||||
print_rank_0(f'Creating {task}-{split} dataset from {data_dir}')
|
||||
self.dataset_name = split
|
||||
detokenizer = blanklm_detokenize
|
||||
source_texts, target_texts = [], []
|
||||
with open(
|
||||
os.path.join(data_dir, f'{filename}.txt'),
|
||||
encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
line = detokenizer(line) if detokenizer else line
|
||||
target_texts.append(line)
|
||||
if split == 'test':
|
||||
with open(
|
||||
os.path.join(
|
||||
data_dir,
|
||||
f'blank/test.maskratio{args.blank_maskratio:.1f}.blank'
|
||||
),
|
||||
encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
line = detokenizer(line) if detokenizer else line
|
||||
source_texts.append(line)
|
||||
else:
|
||||
source_texts = target_texts
|
||||
self.examples, self.example_list = {}, []
|
||||
for idx, (source_text,
|
||||
target_text) in enumerate(zip(source_texts, target_texts)):
|
||||
# if idx > 10000:
|
||||
# break
|
||||
if (idx + 1) % 20000 == 0:
|
||||
print_rank_0(f'Complete {idx + 1} examples')
|
||||
guid = '%s-%s' % (split, idx)
|
||||
meta = {'ref': target_text}
|
||||
example = InputExample(
|
||||
guid=guid, text_a=source_text, text_b=target_text, meta=meta)
|
||||
self.examples[guid] = example
|
||||
self.example_list.append(example)
|
||||
print_rank_0(f'Return {len(self.examples)} {split} examples')
|
||||
self.random = random.Random(args.seed)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.example_list)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
example = self.example_list[idx]
|
||||
source_text, target_text = example.text_a, example.text_b # noqa
|
||||
mask_token = 'gMASK' if self.args.task_mask else 'MASK'
|
||||
mask_id = self.tokenizer.get_command(mask_token).Id
|
||||
sop_id = self.tokenizer.get_command('sop').Id
|
||||
eop_id = self.tokenizer.get_command('eop').Id
|
||||
pad_id = self.tokenizer.get_command('pad').Id
|
||||
if self.split in ['train', 'dev']:
|
||||
masked_src, masked_tgt = self.mask_text(source_text)
|
||||
source_text = masked_src
|
||||
|
||||
def pad_to(text, max_len, pad_id):
|
||||
if len(text) > max_len:
|
||||
text = text[:max_len]
|
||||
else:
|
||||
text = text + [pad_id] * (max_len - len(text))
|
||||
return text
|
||||
|
||||
source_tokens = self.tokenizer.EncodeAsIds(' '
|
||||
+ source_text).tokenization
|
||||
source_tokens = pad_to(source_tokens, self.max_src_length, pad_id)
|
||||
sep = len(source_tokens)
|
||||
position_ids = list(range(len(source_tokens)))
|
||||
block_position_ids = [0] * len(source_tokens)
|
||||
if self.split in ['train', 'dev']:
|
||||
mask_positions = [
|
||||
i for i, x in enumerate(source_tokens) if x == mask_id
|
||||
]
|
||||
assert len(mask_positions) <= len(masked_tgt)
|
||||
tokens = source_tokens
|
||||
target_ids = [0] * len(source_tokens)
|
||||
loss_mask = [0] * len(source_tokens)
|
||||
for i, mask_pos in enumerate(mask_positions):
|
||||
tgt_text = masked_tgt[i]
|
||||
tgt_tokens = self.tokenizer.EncodeAsIds(
|
||||
' ' + tgt_text).tokenization
|
||||
tokens += [sop_id] + tgt_tokens
|
||||
target_ids += tgt_tokens + [eop_id]
|
||||
loss_mask += [1] * (len(tgt_tokens) + 1)
|
||||
position_ids += [mask_pos] * (len(tgt_tokens) + 1)
|
||||
block_position_ids += [
|
||||
i + 1 for i in range(len(tgt_tokens) + 1)
|
||||
]
|
||||
max_length = self.max_src_length + int(
|
||||
self.max_src_length * self.args.blank_maskratio)
|
||||
tokens = pad_to(tokens, max_length, pad_id)
|
||||
target_ids = pad_to(target_ids, max_length, pad_id)
|
||||
loss_mask = pad_to(loss_mask, max_length, 0)
|
||||
position_ids = pad_to(position_ids, max_length, 0)
|
||||
block_position_ids = pad_to(block_position_ids, max_length, 0)
|
||||
position_ids = [position_ids, block_position_ids]
|
||||
sample = {
|
||||
'text': np.array(tokens, dtype=np.int64),
|
||||
'target': np.array(target_ids, dtype=np.int64),
|
||||
'attention_mask': np.array(sep, dtype=np.int64),
|
||||
'loss_mask': np.array(loss_mask, dtype=np.int64),
|
||||
'position_id': np.array(position_ids, dtype=np.int64),
|
||||
'uid': example.guid
|
||||
}
|
||||
else:
|
||||
tokens = source_tokens + [sop_id]
|
||||
mask_pos = source_tokens.index(mask_id)
|
||||
position_ids = position_ids + [mask_pos]
|
||||
block_position_ids = block_position_ids + [1]
|
||||
position_ids = [position_ids, block_position_ids]
|
||||
sample = {
|
||||
'text': np.array(tokens, dtype=np.int64),
|
||||
'attention_mask': np.array(sep, dtype=np.int64),
|
||||
'position_id': np.array(position_ids, dtype=np.int64),
|
||||
'uid': example.guid
|
||||
}
|
||||
return sample
|
||||
|
||||
def mask_text(self, text):
|
||||
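# Randomly mask `blank_maskratio` of the whitespace tokens; adjacent masked
# tokens collapse into a single [MASK], and the removed spans become the
# per-blank targets.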
tokens = text.split()
|
||||
mask_ratio = self.args.blank_maskratio
|
||||
n = len(tokens)
|
||||
indices = sorted(self.random.sample(range(n), int(n * mask_ratio)))
|
||||
masked_src, masked_tgt = '', []
|
||||
for i, idx in enumerate(indices):
|
||||
if i == 0 or idx != indices[i - 1] + 1:
|
||||
masked_tgt.append('')
|
||||
masked_tgt[-1] += ' ' + tokens[idx]
|
||||
tokens[idx] = '[MASK]'
|
||||
for i, token in enumerate(tokens):
|
||||
if i != 0 and token == '[MASK]' and tokens[i - 1] == '[MASK]':
|
||||
continue
|
||||
masked_src += ' ' + token
|
||||
return masked_src, masked_tgt
|
||||
538
modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py
Normal file
@@ -0,0 +1,538 @@
|
||||
# Copyright (c) 2022 Zhipu.AI
|
||||
|
||||
import datetime
|
||||
import random
|
||||
import string
|
||||
|
||||
import mpu
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from generation_utils import (BeamSearchScorer, LogitsProcessorList,
|
||||
MinLengthLogitsProcessor,
|
||||
NoRepeatNGramLogitsProcessor)
|
||||
from rouge_score import rouge_scorer
|
||||
from utils import print_rank_0
|
||||
|
||||
|
||||
def _is_digit(w):
|
||||
for ch in w:
|
||||
if not (ch.isdigit() or ch == ','):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
gigaword_tok_dict = {
|
||||
'(': '-lrb-',
|
||||
')': '-rrb-',
|
||||
'[': '-lsb-',
|
||||
']': '-rsb-',
|
||||
'{': '-lcb-',
|
||||
'}': '-rcb-',
|
||||
'[UNK]': 'UNK',
|
||||
'&': '&',
|
||||
'<': '<',
|
||||
'>': '>'
|
||||
}
|
||||
|
||||
cnndm_tok_dict = {
|
||||
'(': '-LRB-',
|
||||
')': '-RRB-',
|
||||
'[': '-LSB-',
|
||||
']': '-RSB-',
|
||||
'{': '-LCB-',
|
||||
'}': '-RCB-'
|
||||
}
|
||||
|
||||
|
||||
def fix_tokenization(text, dataset):
|
||||
if dataset == 'cnn_dm_org':
|
||||
return text
|
||||
if dataset == 'gigaword':
|
||||
text = text.replace('[UNK]', 'UNK')
|
||||
return text
|
||||
input_tokens = text.split()
|
||||
output_tokens = []
|
||||
has_left_quote = False
|
||||
has_left_single_quote = False
|
||||
|
||||
i = 0
|
||||
prev_dash = False
|
||||
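# Re-apply PTB-style tokenization quirks (``/'' quotes, n't, U.N., 3,000, ...)
# to the detokenized output so it matches the reference tokenization used by
# the ROUGE scorer.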
while i < len(input_tokens):
|
||||
tok = input_tokens[i]
|
||||
flag_prev_dash = False
|
||||
if tok == "\"":
|
||||
if has_left_quote:
|
||||
output_tokens.append("''")
|
||||
else:
|
||||
output_tokens.append('``')
|
||||
has_left_quote = not has_left_quote
|
||||
i += 1
|
||||
elif tok == "'" and len(
|
||||
output_tokens) > 0 and output_tokens[-1].endswith(
|
||||
'n') and i < len(input_tokens) - 1 and input_tokens[
|
||||
i + 1] == 't': # noqa
|
||||
output_tokens[-1] = output_tokens[-1][:-1]
|
||||
output_tokens.append("n't")
|
||||
i += 2
|
||||
elif tok == "'" and i < len(input_tokens) - 1 and input_tokens[
|
||||
i + 1] in ('s', 'd', 'll'):
|
||||
output_tokens.append("'" + input_tokens[i + 1])
|
||||
i += 2
|
||||
elif tok == "'":
|
||||
if has_left_single_quote:
|
||||
output_tokens.append("'")
|
||||
else:
|
||||
output_tokens.append('`')
|
||||
has_left_single_quote = not has_left_single_quote
|
||||
i += 1
|
||||
elif tok == '.' and i < len(input_tokens) - 2 and input_tokens[
|
||||
i + 1] == '.' and input_tokens[i + 2] == '.':
|
||||
output_tokens.append('...')
|
||||
i += 3
|
||||
elif tok == ',' and len(output_tokens) > 0 and _is_digit(
|
||||
output_tokens[-1]) and i < len(input_tokens) - 1 and _is_digit(
|
||||
input_tokens[i + 1]):
|
||||
# $ 3 , 000 -> $ 3,000
|
||||
output_tokens[-1] += ',' + input_tokens[i + 1]
|
||||
i += 2
|
||||
elif tok == '.' and len(output_tokens) > 0 and output_tokens[-1].isdigit() and i < len(input_tokens) - 1 and \
|
||||
input_tokens[i + 1].isdigit():
|
||||
# 3 . 03 -> 3.03
|
||||
output_tokens[-1] += '.' + input_tokens[i + 1]
|
||||
i += 2
|
||||
elif tok == '.' and len(output_tokens) > 0 and len(
|
||||
output_tokens[-1]) == 1 and output_tokens[-1].isalpha( # noqa
|
||||
) and i < len(input_tokens) - 2 and len( # noqa
|
||||
input_tokens[i + 1]) == 1 and input_tokens[
|
||||
i + 1].isalpha( # noqa
|
||||
) and input_tokens[i + 2] == '.': # noqa
|
||||
# U . N . -> U.N.
|
||||
k = i + 3
|
||||
while k + 2 < len(input_tokens):
|
||||
if len(input_tokens[k + 1]) == 1 and input_tokens[
|
||||
k + 1].isalpha() and input_tokens[k + 2] == '.':
|
||||
k += 2
|
||||
else:
|
||||
break
|
||||
output_tokens[-1] += ''.join(input_tokens[i:k])
|
||||
i = k
|
||||
elif tok == '-':
|
||||
if i < len(input_tokens) - 1 and input_tokens[i + 1] == '-':
|
||||
output_tokens.append('--')
|
||||
i += 2
|
||||
elif i == len(input_tokens) - 1 or i == 0:
|
||||
output_tokens.append('-')
|
||||
i += 1
|
||||
elif output_tokens[-1] not in string.punctuation and input_tokens[
|
||||
i + 1][0] not in string.punctuation:
|
||||
output_tokens[-1] += '-'
|
||||
i += 1
|
||||
flag_prev_dash = True
|
||||
else:
|
||||
output_tokens.append('-')
|
||||
i += 1
|
||||
elif prev_dash and len(
|
||||
output_tokens) > 0 and tok[0] not in string.punctuation:
|
||||
output_tokens[-1] += tok
|
||||
i += 1
|
||||
else:
|
||||
output_tokens.append(tok)
|
||||
i += 1
|
||||
prev_dash = flag_prev_dash
|
||||
return ' '.join(output_tokens)
|
||||
|
||||
|
||||
def count_tokens(tokens):
|
||||
counter = {}
|
||||
for t in tokens:
|
||||
if t in counter.keys():
|
||||
counter[t] += 1
|
||||
else:
|
||||
counter[t] = 1
|
||||
return counter
|
||||
|
||||
|
||||
def get_f1(text_a, text_b):
|
||||
tokens_a = text_a.lower().split()
|
||||
tokens_b = text_b.lower().split()
|
||||
if len(tokens_a) == 0 or len(tokens_b) == 0:
|
||||
return 1 if len(tokens_a) == len(tokens_b) else 0
|
||||
set_a = count_tokens(tokens_a)
|
||||
set_b = count_tokens(tokens_b)
|
||||
match = 0
|
||||
for token in set_a.keys():
|
||||
if token in set_b.keys():
|
||||
match += min(set_a[token], set_b[token])
|
||||
p = match / len(tokens_a)
|
||||
r = match / len(tokens_b)
|
||||
return 2.0 * p * r / (p + r + 1e-5)
|
||||
|
||||
|
||||
def remove_duplicate(l_list, duplicate_rate):
|
||||
tk_list = [l.lower().split() for l in l_list] # noqa
|
||||
r_list = []
|
||||
history_set = set()
|
||||
for i, w_list in enumerate(tk_list):
|
||||
w_set = set(w_list)
|
||||
if len(w_set & history_set) / len(w_set) <= duplicate_rate:
|
||||
r_list.append(l_list[i])
|
||||
history_set |= w_set
|
||||
return r_list
|
||||
|
||||
|
||||
def rouge_metric(predictions,
|
||||
labels,
|
||||
examples,
|
||||
metric='rouge-1',
|
||||
duplicate_rate=0.7,
|
||||
dataset='cnn_dm'):
|
||||
metric_dict = {
|
||||
'rouge-1': 'rouge1',
|
||||
'rouge-2': 'rouge2',
|
||||
'rouge-l': 'rougeLsum'
|
||||
}
|
||||
refs = [example.meta['ref'] for example in examples]
|
||||
ref_list = []
|
||||
for ref in refs:
|
||||
ref = ref.strip().split('[SEP]')
|
||||
ref = [fix_tokenization(sentence, dataset=dataset) for sentence in ref]
|
||||
ref = '\n'.join(ref)
|
||||
ref_list.append(ref)
|
||||
pred_list = []
|
||||
for prediction in predictions:
|
||||
buf = []
|
||||
for sentence in prediction.strip().split('[SEP]'):
|
||||
sentence = fix_tokenization(sentence, dataset=dataset)
|
||||
if any(get_f1(sentence, s) > 1.0 for s in buf):
|
||||
continue
|
||||
s_len = len(sentence.split())
|
||||
if s_len <= 4:
|
||||
continue
|
||||
buf.append(sentence)
|
||||
if duplicate_rate and duplicate_rate < 1:
|
||||
buf = remove_duplicate(buf, duplicate_rate)
|
||||
line = '\n'.join(buf)
|
||||
pred_list.append(line)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
import json
|
||||
with open('./results.json', 'w') as output:
|
||||
for ref, pred in zip(ref_list, pred_list):
|
||||
output.write(json.dumps({'ref': ref, 'pred': pred}) + '\n')
|
||||
scorer = rouge_scorer.RougeScorer([metric_dict[metric]], use_stemmer=True)
|
||||
scores = [
|
||||
scorer.score(pred, ref) for pred, ref in zip(pred_list, ref_list)
|
||||
]
|
||||
scores = [score[metric_dict[metric]].fmeasure for score in scores]
|
||||
scores = sum(scores) / len(scores)
|
||||
return scores
|
||||
|
||||
|
||||
def process_batch(batch, args):
|
||||
"""Process batch and produce inputs for the model."""
|
||||
tokens = batch['text'].long().cuda()
|
||||
attention_mask = batch['attention_mask'].long().cuda()
|
||||
position_ids = batch['position_id'].long().cuda()
|
||||
return tokens, attention_mask, position_ids
|
||||
|
||||
|
||||
class DecoderEvaluater:
|
||||
|
||||
def __init__(self, args, tokenizer):
|
||||
self.tokenizer = tokenizer
|
||||
self.start_token = tokenizer.get_command('sop').Id
|
||||
self.end_token = tokenizer.get_command('eop').Id
|
||||
self.mask_token = tokenizer.get_command(
|
||||
'sMASK').Id if args.task_mask else tokenizer.get_command('MASK').Id
|
||||
self.pad_token = tokenizer.get_command('pad').Id
|
||||
self.processors = LogitsProcessorList()
|
||||
if args.min_tgt_length > 0:
|
||||
processor = MinLengthLogitsProcessor(args.min_tgt_length,
|
||||
self.end_token)
|
||||
self.processors.append(processor)
|
||||
if args.no_repeat_ngram_size > 0:
|
||||
processor = NoRepeatNGramLogitsProcessor(args.no_repeat_ngram_size)
|
||||
self.processors.append(processor)
|
||||
|
||||
def evaluate(self, model, dataloader, example_dict, args):
|
||||
"""Calculate correct over total answers and return prediction if the
|
||||
`output_predictions` is true."""
|
||||
model.eval()
|
||||
store = torch.distributed.TCPStore(args.master_ip,
|
||||
18931 + random.randint(0, 10000),
|
||||
mpu.get_data_parallel_world_size(),
|
||||
torch.distributed.get_rank() == 0,
|
||||
datetime.timedelta(seconds=30))
|
||||
print_rank_0('Distributed store created')
|
||||
with torch.no_grad():
|
||||
# For all the batches in the dataset.
|
||||
for idx, data in enumerate(dataloader):
|
||||
tokens, attention_mask, position_ids = process_batch(
|
||||
data, args)
|
||||
batch_size = tokens.size(0)
|
||||
beam_scorer = BeamSearchScorer(
|
||||
batch_size=batch_size,
|
||||
max_length=args.out_seq_length,
|
||||
num_beams=args.num_beams,
|
||||
device=tokens.device,
|
||||
length_penalty=args.length_penalty,
|
||||
do_early_stopping=False,
|
||||
)
|
||||
beam_scores = torch.zeros((batch_size, args.num_beams),
|
||||
dtype=torch.float,
|
||||
device=tokens.device)
|
||||
beam_scores[:, 1:] = -1e9
|
||||
beam_scores = beam_scores.view((batch_size * args.num_beams, ))
|
||||
# Run the model forward.
|
||||
counter = 0
|
||||
while counter < args.tgt_seq_length:
|
||||
if counter == 0:
|
||||
next_token_logits, *mems = model(
|
||||
tokens,
|
||||
position_ids,
|
||||
attention_mask,
|
||||
return_memory=True)
|
||||
seq_length = next_token_logits.size(1)
|
||||
next_token_logits = next_token_logits[:, -1]
|
||||
next_token_logits = next_token_logits.unsqueeze(
|
||||
1).repeat(1, args.num_beams,
|
||||
1).view(batch_size * args.num_beams, -1)
|
||||
mems = [
|
||||
mem.unsqueeze(1).repeat(
|
||||
1, args.num_beams, 1,
|
||||
1).view(batch_size * args.num_beams,
|
||||
seq_length, -1) for mem in mems
|
||||
]
|
||||
position_ids = tokens.new_ones(batch_size,
|
||||
args.num_beams, 2, 1)
|
||||
for i, text in enumerate(tokens.tolist()):
|
||||
mask_pos = text.index(self.mask_token)
|
||||
position_ids[i, :, 0] = mask_pos
|
||||
position_ids = position_ids.reshape(
|
||||
batch_size * args.num_beams, 2, 1)
|
||||
tokens = tokens.new_zeros(batch_size * args.num_beams,
|
||||
0)
|
||||
attention_mask = tokens.new_zeros(
|
||||
[batch_size * args.num_beams])
|
||||
else:
|
||||
if not args.no_block_position:
|
||||
position_ids[:, 1] = counter + 1
|
||||
last_token = tokens[:, -1:]
|
||||
next_token_logits, *mems = model(
|
||||
last_token,
|
||||
position_ids,
|
||||
attention_mask,
|
||||
*mems,
|
||||
return_memory=True)
|
||||
next_token_logits = next_token_logits[:, -1]
|
||||
next_token_scores = F.log_softmax(
|
||||
next_token_logits, dim=-1)
|
||||
next_token_scores = self.processors(
|
||||
tokens, next_token_scores)
|
||||
next_token_scores = next_token_scores + beam_scores[:, None].expand_as(
|
||||
next_token_scores)
|
||||
vocab_size = next_token_scores.shape[-1]
|
||||
next_token_scores = next_token_scores.view(
|
||||
batch_size, args.num_beams * vocab_size)
|
||||
|
||||
probs = F.softmax(next_token_scores, dim=-1)
|
||||
if args.select_topk:
|
||||
_, next_tokens = torch.topk(
|
||||
probs, k=2 * args.num_beams, dim=-1, largest=True)
|
||||
else:
|
||||
next_tokens = torch.multinomial(
|
||||
probs, num_samples=2 * args.num_beams)
|
||||
next_token_scores = torch.gather(next_token_scores, -1,
|
||||
next_tokens)
|
||||
next_token_scores, _indices = torch.sort(
|
||||
next_token_scores, descending=True, dim=1)
|
||||
next_tokens = torch.gather(next_tokens, -1, _indices)
|
||||
|
||||
next_indices = next_tokens // vocab_size
|
||||
next_tokens = next_tokens % vocab_size
|
||||
# stateless
|
||||
beam_outputs = beam_scorer.process(
|
||||
tokens,
|
||||
next_token_scores,
|
||||
next_tokens,
|
||||
next_indices,
|
||||
eos_token_id=self.end_token,
|
||||
pad_token_id=self.pad_token)
|
||||
beam_scores = beam_outputs['next_beam_scores']
|
||||
beam_next_tokens = beam_outputs['next_beam_tokens']
|
||||
beam_idx = beam_outputs['next_beam_indices']
|
||||
beam_next_tokens = beam_next_tokens.unsqueeze(-1)
|
||||
tokens = torch.cat([tokens[beam_idx, :], beam_next_tokens],
|
||||
dim=-1)
|
||||
mems = [mem[beam_idx] for mem in mems] if mems else []
|
||||
if beam_scorer.is_done:
|
||||
break
|
||||
counter += 1
|
||||
tokens, _ = beam_scorer.finalize(
|
||||
tokens,
|
||||
beam_scores,
|
||||
next_tokens,
|
||||
next_indices,
|
||||
eos_token_id=self.end_token,
|
||||
pad_token_id=self.pad_token)
|
||||
predictions = []
|
||||
for text in tokens.tolist():
|
||||
text = [
|
||||
token for token in text
|
||||
if token not in [self.end_token, self.pad_token]
|
||||
]
|
||||
text = self.tokenizer.DecodeIds(text)
|
||||
predictions.append(text)
|
||||
uid_list = data['uid']
|
||||
if isinstance(uid_list, torch.Tensor):
|
||||
uid_list = uid_list.cpu().numpy().tolist()
|
||||
for uid, prediction in zip(uid_list, predictions):
|
||||
store.set(uid, prediction)
|
||||
if (idx + 1) % args.log_interval == 0:
|
||||
print_rank_0(f'Iteration {idx + 1} / {len(dataloader)}')
|
||||
model.train()
|
||||
torch.distributed.barrier()
|
||||
print_rank_0('Evaluation completed')
|
||||
predictions, examples = [], []
|
||||
for uid, example in example_dict.items():
|
||||
predictions.append(store.get(uid).decode('utf-8'))
|
||||
examples.append(example)
|
||||
torch.distributed.barrier()
|
||||
return predictions, [], examples
|
||||
|
||||
|
||||
def blanklm_fix_tokenization(text):
|
||||
text = text.replace('` `', '``')
|
||||
text = text.replace("\' \'", "\'\'")
|
||||
text = text.replace("n \' t", "n\'t")
|
||||
text = text.replace("\' s", "\'s")
|
||||
text = text.replace("\' m", "\'m")
|
||||
text = text.replace("\' re", "\'re")
|
||||
text = text.replace('. . .', '...')
|
||||
text = text.replace(' . .', ' ..')
|
||||
text = text.replace('- -', '--')
|
||||
text = text.replace('u . s .', 'u.s.')
|
||||
text = text.replace('u . k .', 'u.k.')
|
||||
text = text.replace('e . g .', 'e.g.')
|
||||
return text
|
||||
|
||||
|
||||
class BlankLMEvaluater(DecoderEvaluater):
|
||||
|
||||
def evaluate(self, model, dataloader, example_dict, args):
|
||||
model.eval()
|
||||
store = torch.distributed.TCPStore(args.master_ip,
|
||||
18931 + random.randint(0, 10000),
|
||||
mpu.get_data_parallel_world_size(),
|
||||
torch.distributed.get_rank() == 0,
|
||||
datetime.timedelta(seconds=30))
|
||||
print_rank_0('Distributed store created')
|
||||
|
||||
with torch.no_grad():
|
||||
for idx, data in enumerate(dataloader):
|
||||
tokens, attention_mask, position_ids = process_batch(
|
||||
data, args)
|
||||
src_tokens = tokens
|
||||
batch_size = tokens.size(0)
|
||||
mask_positions = []
|
||||
current_mask = []
|
||||
for text in tokens.tolist():
|
||||
mask_positions.append([
|
||||
i for i, x in enumerate(text) if x == self.mask_token
|
||||
])
|
||||
current_mask.append(0)
|
||||
# print(self.tokenizer.DecodeIds(text))
|
||||
# print(mask_positions[-1])
|
||||
counter = 0
|
||||
done = [False] * batch_size
|
||||
while counter < args.tgt_seq_length:
|
||||
if counter == 0:
|
||||
# print(tokens)
|
||||
# print(position_ids)
|
||||
next_token_logits, *mems = model(
|
||||
tokens,
|
||||
position_ids,
|
||||
attention_mask,
|
||||
return_memory=True)
|
||||
next_token_logits = next_token_logits[:, -1]
|
||||
position_ids = tokens.new_ones(batch_size, 2, 1)
|
||||
for i, text in enumerate(tokens.tolist()):
|
||||
mask_pos = mask_positions[i][current_mask[i]]
|
||||
position_ids[i, 0] = mask_pos
|
||||
tokens = tokens.new_zeros(batch_size, 0)
|
||||
attention_mask = tokens.new_zeros(batch_size)
|
||||
else:
|
||||
position_ids[:, 1] = position_ids[:, 1] + 1
|
||||
last_token = tokens[:, -1:]
|
||||
next_token_logits, *mems = model(
|
||||
last_token,
|
||||
position_ids,
|
||||
attention_mask,
|
||||
*mems,
|
||||
return_memory=True)
|
||||
next_token_logits = next_token_logits[:, -1]
|
||||
next_token_scores = F.log_softmax(
|
||||
next_token_logits, dim=-1)
|
||||
next_token_scores = self.processors(
|
||||
tokens, next_token_scores)
|
||||
next_tokens = next_token_scores.max(dim=-1)[1]
|
||||
# print(self.tokenizer.DecodeIds(next_tokens.tolist()))
|
||||
for i, next_token in enumerate(next_tokens.tolist()):
|
||||
if next_token == self.end_token:
|
||||
if current_mask[i] + 1 < len(mask_positions[i]):
|
||||
current_mask[i] += 1
|
||||
next_tokens[i] = self.start_token
|
||||
position_ids[i, 0] = mask_positions[i][
|
||||
current_mask[i]]
|
||||
position_ids[i, 1] = 0
|
||||
else:
|
||||
done[i] = True
|
||||
if done[i]:
|
||||
next_tokens[i] = self.pad_token
|
||||
if all(done):
|
||||
break
|
||||
tokens = torch.cat(
|
||||
[tokens, next_tokens.unsqueeze(-1)], dim=-1)
|
||||
counter += 1
|
||||
predictions = []
|
||||
for i, text in enumerate(tokens.tolist()):
|
||||
text = [
|
||||
token for token in text
|
||||
if token not in [self.end_token, self.pad_token]
|
||||
]
|
||||
blanks = [[]]
|
||||
for token in text:
|
||||
if token == self.start_token:
|
||||
blanks.append([])
|
||||
else:
|
||||
blanks[-1].append(token)
|
||||
output_tokens = []
|
||||
current_blank = 0
|
||||
for token in src_tokens[i].tolist():
|
||||
if token == self.mask_token:
|
||||
if current_blank < len(blanks):
|
||||
output_tokens += blanks[current_blank]
|
||||
current_blank += 1
|
||||
else:
|
||||
if token not in [self.pad_token]:
|
||||
output_tokens.append(token)
|
||||
text = self.tokenizer.DecodeIds(output_tokens[:-1])
|
||||
text = blanklm_fix_tokenization(text)
|
||||
predictions.append(text)
|
||||
# print(text)
|
||||
uid_list = data['uid']
|
||||
if isinstance(uid_list, torch.Tensor):
|
||||
uid_list = uid_list.cpu().numpy().tolist()
|
||||
for uid, prediction in zip(uid_list, predictions):
|
||||
store.set(uid, prediction)
|
||||
if (idx + 1) % args.log_interval == 0:
|
||||
print_rank_0(f'Iteration {idx + 1} / {len(dataloader)}')
|
||||
|
||||
model.train()
|
||||
torch.distributed.barrier()
|
||||
print_rank_0('Evaluation completed')
|
||||
predictions, examples = [], []
|
||||
for uid, example in example_dict.items():
|
||||
predictions.append(store.get(uid).decode('utf-8'))
|
||||
examples.append(example)
|
||||
torch.distributed.barrier()
|
||||
return predictions, [], examples
|
||||
151
modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py
Normal file
@@ -0,0 +1,151 @@
|
||||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Race."""
|
||||
import functools
|
||||
from collections import OrderedDict
|
||||
|
||||
import mpu
|
||||
import torch
|
||||
from finetune_glm import finetune
|
||||
from pretrain_glm import get_batch
|
||||
from tasks.eval_utils import accuracy_func_provider
|
||||
from tasks.seq2seq.dataset import (BlankLMDataset, ExtractionDataset,
|
||||
Seq2SeqDataset)
|
||||
from tasks.seq2seq.evaluate import (BlankLMEvaluater, DecoderEvaluater,
|
||||
rouge_metric)
|
||||
|
||||
global_tokenizer = None
|
||||
|
||||
|
||||
def seq2seq_forward_step(data, model, args, timers, mems):
|
||||
"""Forward step."""
|
||||
|
||||
# Get the batch.
|
||||
if timers is not None:
|
||||
timers('batch generator').start()
|
||||
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
|
||||
data, args)
|
||||
if timers is not None:
|
||||
timers('batch generator').stop()
|
||||
# Forward model.
|
||||
logits, *mems = model(tokens, position_ids, attention_mask, *mems)
|
||||
# logits, loss_mask = logits[:, args.src_seq_length:], loss_mask[:, args.src_seq_length:]
|
||||
# target_ids = target_ids[:, args.src_seq_length:]
|
||||
losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(),
|
||||
labels)
|
||||
if args.label_smoothing > 0.0:
|
||||
epsilon = args.label_smoothing
|
||||
smooth_loss = -torch.nn.functional.log_softmax(
|
||||
logits, dim=-1).mean(dim=-1)
|
||||
losses = (1 - epsilon) * losses + epsilon * smooth_loss
|
||||
loss_mask = loss_mask.reshape(-1)
|
||||
# The loss is not normalized for fair comparison
|
||||
loss = torch.sum(losses.reshape(-1) * loss_mask) / loss_mask.sum()
|
||||
return loss, mems, 'bert'
|
||||
|
||||
|
||||
def train_valid_datasets_provider(args, tokenizer):
|
||||
"""Provide train and validation datasets."""
|
||||
if args.task.lower() == 'blank':
|
||||
train_dataset = BlankLMDataset(
|
||||
args, split='train', tokenizer=tokenizer)
|
||||
valid_dataset = None
|
||||
elif args.task.lower() == 'extraction':
|
||||
train_dataset = ExtractionDataset(
|
||||
args, split='train', tokenizer=tokenizer)
|
||||
valid_dataset = None
|
||||
else:
|
||||
train_dataset = Seq2SeqDataset(
|
||||
args, split='train', tokenizer=tokenizer)
|
||||
valid_dataset = None
|
||||
global global_tokenizer
|
||||
global_tokenizer = tokenizer
|
||||
return train_dataset, valid_dataset
|
||||
|
||||
|
||||
def metrics_func_provider(args, tokenizer, is_test):
|
||||
"""Provide metrics callback function."""
|
||||
|
||||
def single_dataset_provider(split):
|
||||
if args.task.lower() == 'blank':
|
||||
return BlankLMDataset(args, split=split, tokenizer=tokenizer)
|
||||
elif args.task.lower() == 'extraction':
|
||||
return ExtractionDataset(args, split=split, tokenizer=tokenizer)
|
||||
else:
|
||||
return Seq2SeqDataset(args, split=split, tokenizer=tokenizer)
|
||||
|
||||
if args.task.lower() in ['blank', 'extraction']:
|
||||
evaluater = BlankLMEvaluater(args, tokenizer)
|
||||
eval_func = evaluater.evaluate
|
||||
metric_dict = {}
|
||||
else:
|
||||
evaluater = DecoderEvaluater(args, tokenizer)
|
||||
eval_func = evaluater.evaluate
|
||||
if args.tokenizer_type == 'BertWordPieceTokenizer':
|
||||
dataset = 'cnn_dm'
|
||||
elif args.task.lower() == 'gigaword':
|
||||
dataset = 'gigaword'
|
||||
else:
|
||||
dataset = 'cnn_dm_org'
|
||||
metric_dict = OrderedDict({
|
||||
'rouge-1':
|
||||
functools.partial(rouge_metric, metric='rouge-1', dataset=dataset),
|
||||
'rouge-2':
|
||||
functools.partial(rouge_metric, metric='rouge-2', dataset=dataset),
|
||||
'rouge-l':
|
||||
functools.partial(rouge_metric, metric='rouge-l', dataset=dataset)
|
||||
})
|
||||
|
||||
def output_func(predictions, examples, output_file):
|
||||
with open(output_file + '.hyps', 'w', encoding='utf-8') as output:
|
||||
for prediction in predictions:
|
||||
output.write(prediction)
|
||||
output.write('\n')
|
||||
with open(output_file + '.refs', 'w', encoding='utf-8') as output:
|
||||
for example in examples:
|
||||
output.write(example.meta['ref'])
|
||||
output.write('\n')
|
||||
if args.task.lower() == 'squad_generation':
|
||||
with open(
|
||||
output_file + '.source', 'w', encoding='utf-8') as output:
|
||||
for example in examples:
|
||||
output.write(
|
||||
example.text_a.replace('\n', ' ') + ' Answer: '
|
||||
+ example.meta['answer'])
|
||||
output.write('\n')
|
||||
|
||||
return accuracy_func_provider(
|
||||
single_dataset_provider,
|
||||
metric_dict,
|
||||
args,
|
||||
is_test=is_test,
|
||||
eval_func=eval_func,
|
||||
output_func=output_func,
|
||||
only_rank0=False)
|
||||
|
||||
|
||||
def main(args):
|
||||
if args.src_seq_length > args.max_position_embeddings:
|
||||
args.max_position_embeddings = args.src_seq_length
|
||||
if args.task.lower() in [
|
||||
'cnn_dm', 'cnn_dm_original', 'gigaword', 'blank',
|
||||
'squad_generation', 'xsum', 'extraction'
|
||||
]:
|
||||
finetune(
|
||||
args,
|
||||
train_valid_datasets_provider, {},
|
||||
end_of_epoch_callback_provider=metrics_func_provider,
|
||||
forward_step=seq2seq_forward_step)
|
||||
else:
|
||||
raise NotImplementedError(args.task)
|
||||
137
modelscope/models/nlp/mglm/tasks/superglue/README.md
Normal file
@@ -0,0 +1,137 @@
|
||||
# Use GLM for your NLU tasks

To use GLM for your own NLU tasks, you should implement a subclass of `DataProcessor` in [tasks/superglue/dataset.py](dataset.py) and a subclass of `PVP` in [tasks/superglue/pvp.py](pvp.py). You should also specify the evaluation metrics for your task in `DEFAULT_METRICS` (see step 4 below). We will take the RTE and ReCoRD tasks in SuperGLUE as examples.

## 1. Design your patterns

RTE is an NLI task in which the model is required to predict text entailment between a premise and a hypothesis. The label can be `entailment` or `not_entailment`. One sample from the training set is
```
premise: No Weapons of Mass Destruction Found in Iraq Yet.
hypothesis: Weapons of Mass Destruction Found in Iraq.
label: not_entailment
```
We design the pattern as
```
"`hypothesis`"?, [MASK], "`premise`"
```
GLM predicts "Yes" for `entailment` and "No" for `not_entailment`. "Yes" and "No" are called the verbalizers for `entailment` and `not_entailment`.

ReCoRD is a multi-choice QA task. Each example consists of a news article and a cloze-style question about the article in which one entity is masked out. The system must predict the masked-out entity from a list of possible entities in the provided passage. We directly adopt the cloze-style question as our pattern and use GLM to predict the masked entity.

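A minimal sketch of how the RTE pattern and verbalizers above combine into the cloze-style input that GLM scores is shown below. This is illustrative only: the helper `build_rte_cloze` is not part of the repository, and the real assembly happens inside `RtePVP.get_parts` (see step 3).

```python
import string


def build_rte_cloze(premise: str, hypothesis: str) -> str:
    # Hypothetical helper mirroring what RtePVP.get_parts assembles.
    hypothesis = hypothesis.rstrip(string.punctuation)
    return f'"{hypothesis}" ? [MASK], "{premise}"'


cloze = build_rte_cloze(
    'No Weapons of Mass Destruction Found in Iraq Yet.',
    'Weapons of Mass Destruction Found in Iraq.')
# GLM fills the [MASK] position and compares the scores of the verbalizer
# tokens " Yes" and " No"; the higher-scoring one gives the predicted label.
print(cloze)
```
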
## 2. Implement subclass of `DataProcessor`

A subclass of `DataProcessor` should implement `get_train_examples`, `get_dev_examples` and `get_test_examples`, which return the examples of the train, dev, and test sets. The returned value is a list of `InputExample`. It should also implement `get_labels` to return the list of possible labels. Here we take `RteProcessor` as an example:
```python
|
||||
class RteProcessor(DataProcessor):
|
||||
"""Processor for the RTE data set."""
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
return self._create_examples(os.path.join(data_dir, "train.jsonl"), "train")
|
||||
|
||||
def get_dev_examples(self, data_dir, for_train=False):
|
||||
return self._create_examples(os.path.join(data_dir, "val.jsonl"), "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
return self._create_examples(os.path.join(data_dir, "test.jsonl"), "test")
|
||||
|
||||
def get_unlabeled_examples(self, data_dir):
|
||||
return self._create_examples(os.path.join(data_dir, "unlabeled.jsonl"), "unlabeled")
|
||||
|
||||
def get_labels(self):
|
||||
return ["entailment", "not_entailment"]
|
||||
|
||||
def _create_examples(self, path: str, set_type: str, hypothesis_name: str = "hypothesis",
|
||||
premise_name: str = "premise") -> List[InputExample]:
|
||||
examples = []
|
||||
|
||||
with open(path, encoding='utf8') as f:
|
||||
for line_idx, line in enumerate(f):
|
||||
example_json = json.loads(line)
|
||||
idx = example_json['idx']
|
||||
if isinstance(idx, str):
|
||||
try:
|
||||
idx = int(idx)
|
||||
except ValueError:
|
||||
idx = line_idx
|
||||
label = example_json.get('label')
|
||||
guid = "%s-%s" % (set_type, idx)
|
||||
text_a = example_json[premise_name]
|
||||
text_b = example_json[hypothesis_name]
|
||||
|
||||
example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx)
|
||||
examples.append(example)
|
||||
|
||||
return examples
|
||||
```

After that, you should add the implemented class to `PROCESSORS` at the end of [tasks/superglue/dataset.py](dataset.py):
```python
|
||||
PROCESSORS = {
|
||||
...
|
||||
"rte": RteProcessor
|
||||
}
|
||||
```
|
||||
|
||||
## 3. Implement subclass of `PVP`

To implement a subclass of `PVP`, you should first decide whether your verbalizers are single-token or multi-token. The verbalizers in RTE, "Yes" and "No", are single-token. In contrast, the verbalizers in ReCoRD are multi-token, as one entity can be tokenized into multiple tokens with a WordPiece or BPE tokenizer.

For a single-token task, you should set `is_multi_token=False` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `verbalize` to return the verbalizer given a label. Take `RtePVP` as an example:
```python
|
||||
class RtePVP(PVP):
|
||||
is_multi_token = False
|
||||
VERBALIZER = {
|
||||
"not_entailment": [" No"],
|
||||
"entailment": [" Yes"]
|
||||
}
|
||||
|
||||
@property
|
||||
def spell_length(self):
|
||||
return self.pattern_id
|
||||
|
||||
def get_parts(self, example: InputExample) -> FilledPattern:
|
||||
# switch text_a and text_b to get the correct order
|
||||
text_a = example.text_a
|
||||
text_b = example.text_b.rstrip(string.punctuation)
|
||||
return ['"', self.shortenable(text_b), '" ?'], [[self.mask], ', "', self.shortenable(text_a), '"']
|
||||
|
||||
def verbalize(self, label) -> List[str]:
|
||||
return RtePVP.VERBALIZER[label]
|
||||
```

We use `PVP.shortenable` to mark the segments that can be truncated when they exceed the maximum sequence length.

For a multi-token task, you should set `is_multi_token=True` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `get_answers` to return the candidates. Take `RecordPVP` as an example:
```python
|
||||
class RecordPVP(PVP):
|
||||
is_multi_token = True
|
||||
|
||||
def get_answers(self, example: InputExample):
|
||||
choices = example.meta['candidates']
|
||||
choices = [" " + choice for choice in choices]
|
||||
return choices
|
||||
|
||||
def get_parts(self, example: InputExample) -> FilledPattern:
|
||||
premise = self.shortenable(example.text_a)
|
||||
|
||||
assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token'
|
||||
question_a, question_b = example.text_b.split('@placeholder')
|
||||
return [premise, " " + question_a.rstrip(), [self.mask], question_b], []
|
||||
```

After that, you should add the implemented class to `PVPS` at the end of [tasks/superglue/pvp.py](pvp.py):
```python
|
||||
PVPS = {
|
||||
...
|
||||
'rte': RtePVP,
|
||||
'record': RecordPVP
|
||||
}
|
||||
```
|
||||
## 4. Run the experiment
|
||||
To run the experiment for your new task, you should create a config file like [config_tasks/task_rte.sh](/config_tasks/task_rte.sh). You should also specify the evaluation metrics for the task in `DEFAULT_METRICS` of [tasks/superglue/finetune.py](finetune.py):
|
||||
```python
|
||||
DEFAULT_METRICS = {
|
||||
...
|
||||
"record": [("EM", qa_exact_match), ("F1", qa_f1)],
|
||||
"rte": [("accuracy", accuracy_metric)]
|
||||
}
|
||||
```
|
||||
Then you can run the experiment with [finetune_superglue.sh](/scripts/finetune_superglue.sh):
|
||||
```shell
|
||||
bash scripts/finetune_superglue.sh \
|
||||
config_tasks/model_blocklm_large.sh \
|
||||
config_tasks/task_rte.sh
|
||||
```
|
||||
1475
modelscope/models/nlp/mglm/tasks/superglue/dataset.py
Normal file
File diff suppressed because it is too large
101
modelscope/models/nlp/mglm/tasks/superglue/evaluate.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# Copyright (c) 2022 Zhipu.AI
|
||||
"""
|
||||
Official evaluation script for ReCoRD v1.0.
|
||||
(Some functions are adopted from the SQuAD evaluation script.)
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
import functools
|
||||
import re
|
||||
import string
|
||||
from collections import Counter, defaultdict
|
||||
from typing import List
|
||||
|
||||
from tasks.data_utils import InputExample
|
||||
|
||||
|
||||
def normalize_answer(s):
|
||||
"""Lower text and remove punctuation, articles and extra whitespace."""
|
||||
|
||||
def remove_articles(text):
|
||||
return re.sub(r'\b(a|an|the)\b', ' ', text)
|
||||
|
||||
def white_space_fix(text):
|
||||
return ' '.join(text.split())
|
||||
|
||||
def remove_punc(text):
|
||||
exclude = set(string.punctuation)
|
||||
return ''.join(ch for ch in text if ch not in exclude)
|
||||
|
||||
def lower(text):
|
||||
return text.lower()
|
||||
|
||||
return white_space_fix(remove_articles(remove_punc(lower(s))))
|
||||
|
||||
|
||||
def f1_score(prediction, ground_truth):
|
||||
prediction_tokens = normalize_answer(prediction).split()
|
||||
ground_truth_tokens = normalize_answer(ground_truth).split()
|
||||
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
|
||||
num_same = sum(common.values())
|
||||
if num_same == 0:
|
||||
return 0
|
||||
precision = 1.0 * num_same / len(prediction_tokens)
|
||||
recall = 1.0 * num_same / len(ground_truth_tokens)
|
||||
f1 = (2 * precision * recall) / (precision + recall)
|
||||
return f1
|
||||
|
||||
|
||||
def exact_match_score(prediction, ground_truth):
|
||||
return normalize_answer(prediction) == normalize_answer(ground_truth)
|
||||
|
||||
|
||||
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
|
||||
if not ground_truths:
|
||||
return 0.0
|
||||
scores_for_ground_truths = []
|
||||
for ground_truth in ground_truths:
|
||||
score = metric_fn(prediction, ground_truth)
|
||||
scores_for_ground_truths.append(score)
|
||||
return max(scores_for_ground_truths)
|
||||
|
||||
|
||||
def qa_evaluate(predictions, labels, examples: List[InputExample], metric):
|
||||
assert len(examples) == len(predictions)
|
||||
score = 0.0
|
||||
for example, prediction in zip(examples, predictions):
|
||||
ground_truths = example.meta['answers']
|
||||
prediction = example.meta['candidates'][prediction]
|
||||
if ground_truths:
|
||||
score += metric_max_over_ground_truths(metric, prediction,
|
||||
ground_truths)
|
||||
score = 100.0 * score / len(predictions)
|
||||
return score
|
||||
|
||||
|
||||
def multirc_em(predictions, labels, examples: List[InputExample]):
|
||||
"""Compute the exact match (EM) for a sequence of predictions and actual labels"""
|
||||
question_ids = [example.meta['question_idx'] for example in examples]
|
||||
unique_questions = set(question_ids)
|
||||
|
||||
q_actuals = list(zip(question_ids, labels))
|
||||
q_predictions = list(zip(question_ids, predictions))
|
||||
|
||||
actuals_per_question = defaultdict(list)
|
||||
predictions_per_question = defaultdict(list)
|
||||
|
||||
for qid, val in q_actuals:
|
||||
actuals_per_question[qid].append(val)
|
||||
for qid, val in q_predictions:
|
||||
predictions_per_question[qid].append(val)
|
||||
|
||||
em = 0
|
||||
for qid in unique_questions:
|
||||
if actuals_per_question[qid] == predictions_per_question[qid]:
|
||||
em += 1
|
||||
em /= len(unique_questions)
|
||||
return em
|
||||
|
||||
|
||||
qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score)
|
||||
qa_f1 = functools.partial(qa_evaluate, metric=f1_score)
|
||||
138
modelscope/models/nlp/mglm/tasks/superglue/finetune.py
Normal file
@@ -0,0 +1,138 @@
|
||||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Race."""
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from finetune_glm import finetune
|
||||
from tasks.eval_utils import (accuracy_func_provider, accuracy_metric,
|
||||
f1_macro_metric, f1_metric)
|
||||
from tasks.superglue.dataset import (CLASSIFICATION_DATASETS,
|
||||
MULTI_CHOICE_DATASETS, PROCESSORS,
|
||||
SuperGlueDataset, get_output_func)
|
||||
from tasks.superglue.evaluate import multirc_em, qa_exact_match, qa_f1
|
||||
from tasks.superglue.pvp import PVPS
|
||||
|
||||
DEFAULT_METRICS = {
|
||||
'record': [('EM', qa_exact_match), ('F1', qa_f1)],
|
||||
'copa': [('accuracy', accuracy_metric)],
|
||||
'rte': [('accuracy', accuracy_metric)],
|
||||
'boolq': [('accuracy', accuracy_metric)],
|
||||
'wic': [('accuracy', accuracy_metric)],
|
||||
'wsc': [('accuracy', accuracy_metric)],
|
||||
'cb': [('accuracy', accuracy_metric), ('f1-macro', f1_macro_metric)],
|
||||
'multirc': [('f1a', f1_metric), ('em', multirc_em),
|
||||
('acc', accuracy_metric)],
|
||||
'mnli': [('accuracy', accuracy_metric)],
|
||||
'sst2': [('accuracy', accuracy_metric)],
|
||||
'qnli': [('accuracy', accuracy_metric)],
|
||||
'qqp': [('accuracy', accuracy_metric)],
|
||||
'mrpc': [('accuracy', accuracy_metric)],
|
||||
'cola': [('accuracy', accuracy_metric)],
|
||||
'squad': [('accuracy', accuracy_metric)],
|
||||
}
|
||||
|
||||
|
||||
def train_valid_datasets_provider(args, tokenizer, pattern_text=False):
|
||||
"""Provide train and validation datasets."""
|
||||
task_name = args.task.lower()
|
||||
data_dir = args.data_dir
|
||||
train_dataset = SuperGlueDataset(
|
||||
args,
|
||||
task_name,
|
||||
data_dir,
|
||||
args.seq_length,
|
||||
'train',
|
||||
tokenizer,
|
||||
pattern_text=pattern_text)
|
||||
valid_dataset = SuperGlueDataset(
|
||||
args,
|
||||
task_name,
|
||||
data_dir,
|
||||
args.seq_length,
|
||||
'dev',
|
||||
tokenizer,
|
||||
for_train=True,
|
||||
pattern_text=pattern_text)
|
||||
|
||||
return train_dataset, valid_dataset
|
||||
|
||||
|
||||
def metrics_func_provider(args, tokenizer, is_test):
|
||||
"""Privde metrics callback function."""
|
||||
|
||||
def single_dataset_provider(split):
|
||||
return SuperGlueDataset(args, args.task.lower(), args.data_dir,
|
||||
args.seq_length, split, tokenizer)
|
||||
|
||||
output_func = get_output_func(args.task.lower(), args)
|
||||
eval_func = None
|
||||
if args.task.lower() in ['wsc', 'squad'
|
||||
] and args.cloze_eval and not args.wsc_negative:
|
||||
from tasks.language_model.finetune import classify_evaluate
|
||||
eval_func = classify_evaluate
|
||||
metric_dict = OrderedDict(DEFAULT_METRICS[args.task.lower()])
|
||||
return accuracy_func_provider(
|
||||
single_dataset_provider,
|
||||
metric_dict,
|
||||
args,
|
||||
is_test=is_test,
|
||||
eval_func=eval_func,
|
||||
output_func=output_func,
|
||||
only_rank0=False,
|
||||
tokenizer=tokenizer)
|
||||
|
||||
|
||||
def main(args):
|
||||
model_kwargs = {}
|
||||
processor = PROCESSORS[args.task.lower()](args)
|
||||
pvp = PVPS[args.task.lower()](
|
||||
args,
|
||||
None,
|
||||
processor.get_labels(),
|
||||
args.seq_length,
|
||||
pattern_id=args.pattern_id,
|
||||
is_multi_token=args.multi_token,
|
||||
num_prompt_tokens=args.num_prompt_tokens)
|
||||
if args.continuous_prompt:
|
||||
model_kwargs['spell_length'] = pvp.spell_length
|
||||
if args.task.lower() in ['wsc', 'squad'
|
||||
] and args.cloze_eval and not args.wsc_negative:
|
||||
from tasks.language_model.finetune import lm_forward_step
|
||||
finetune(
|
||||
args,
|
||||
train_valid_datasets_provider,
|
||||
model_kwargs,
|
||||
end_of_epoch_callback_provider=metrics_func_provider,
|
||||
forward_step=lm_forward_step)
|
||||
else:
|
||||
if args.cloze_eval:
|
||||
multi_token = pvp.is_multi_token
|
||||
else:
|
||||
multi_token = args.task.lower() in MULTI_CHOICE_DATASETS
|
||||
args.multi_token = multi_token
|
||||
if not multi_token:
|
||||
model_kwargs[
|
||||
'model_type'] = 'multiple_choice' if args.cloze_eval else 'classification'
|
||||
model_kwargs['multi_token'] = False
|
||||
model_kwargs['num_labels'] = len(processor.get_labels())
|
||||
else:
|
||||
model_kwargs['model_type'] = 'multiple_choice'
|
||||
model_kwargs['multi_token'] = True
|
||||
model_kwargs['num_labels'] = 1
|
||||
finetune(
|
||||
args,
|
||||
train_valid_datasets_provider,
|
||||
model_kwargs,
|
||||
end_of_epoch_callback_provider=metrics_func_provider)
|
||||
1541
modelscope/models/nlp/mglm/tasks/superglue/pvp.py
Normal file
File diff suppressed because it is too large
0
modelscope/models/nlp/mglm/test/__init__.py
Normal file
36
modelscope/models/nlp/mglm/test/test_block.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# Copyright (c) 2022 Zhipu.AI
|
||||
|
||||
import random
|
||||
from argparse import Namespace
|
||||
|
||||
import numpy as np
|
||||
from blocklm_utils import ConstructBlockStrategy
|
||||
|
||||
|
||||
# rng = random.Random()
|
||||
# span_lengths = [2, 3, 4, 2, 3, 4]
|
||||
# length = 100
|
||||
#
|
||||
# counts = np.array([0] * length)
|
||||
# for _ in range(10000):
|
||||
# rng.shuffle(span_lengths)
|
||||
# spans = ConstructBlockStrategy.sample_spans(span_lengths, length, rng)
|
||||
# for start, end in spans:
|
||||
# counts[start: end] += 1
|
||||
# print(counts)
|
||||
def main():
|
||||
args = Namespace()
|
||||
args.seq_length = 10
|
||||
args.eod_token = 0
|
||||
|
||||
strategy = ConstructBlockStrategy(
|
||||
args, None, bert_ratio=0.4, max_seq_length=128)
|
||||
counts = np.array([0] * 10)
|
||||
for _ in range(10000):
|
||||
spans = strategy.sample_span_in_document(
|
||||
np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1],
|
||||
random.Random())
|
||||
for start, end in spans:
|
||||
counts[start:end] += 1
|
||||
|
||||
print(counts)
|
||||
27
modelscope/models/nlp/mglm/test/test_rel_shift.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# Copyright (c) 2022 Zhipu.AI
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from learning_rates import AnnealingLR
|
||||
from torch.nn.modules import Linear
|
||||
from torch.optim import Adam
|
||||
|
||||
|
||||
def main():
|
||||
model = Linear(10, 10)
|
||||
optimizer = Adam(model.parameters())
|
||||
lr_scheduler = AnnealingLR(
|
||||
optimizer,
|
||||
start_lr=0.00015,
|
||||
warmup_iter=3000,
|
||||
num_iters=300000,
|
||||
decay_style='cosine',
|
||||
decay_ratio=0.1)
|
||||
steps = np.arange(0, 400000, 10, dtype=np.long)
|
||||
rates = []
|
||||
for step in steps:
|
||||
lr_scheduler.num_iters = step
|
||||
rates.append(lr_scheduler.get_lr())
|
||||
print(rates)
|
||||
plt.plot(steps, rates)
|
||||
plt.savefig('lr.pdf', format='pdf')
|
||||
472
modelscope/models/nlp/mglm/train_utils.py
Normal file
@@ -0,0 +1,472 @@
|
||||
# Copyright (c) 2022 Zhipu.AI
|
||||
|
||||
import deepspeed
|
||||
import torch
|
||||
from apex.optimizers import FusedAdam as Adam
|
||||
from torch import distributed as dist
|
||||
|
||||
from . import mpu
|
||||
from .fp16 import DynamicLossScaler, FP16_Module, FP16_Optimizer
|
||||
from .model import DistributedDataParallel as LocalDDP
|
||||
from .model import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast,
|
||||
GLMForSequenceClassification, GLMForSingleTokenCloze,
|
||||
GLMModel)
|
||||
from .model import PyTorchDistributedDataParallel as TorchDDP
|
||||
from .model import glm_get_params_for_weight_decay_optimization
|
||||
from .utils import get_checkpoint_iteration, get_checkpoint_name, print_rank_0
|
||||
|
||||
|
||||
def load_pretrained(model, checkpoint_path, args, task_tokens=None):
|
||||
load_dir, tag, release, success = get_checkpoint_iteration(checkpoint_path)
|
||||
checkpoint_name = get_checkpoint_name(load_dir, tag, release)
|
||||
if mpu.get_data_parallel_rank() == 0:
|
||||
print('global rank {} is loading pretrained model {}'.format(
|
||||
torch.distributed.get_rank(), checkpoint_name))
|
||||
# Load the checkpoint.
|
||||
sd = torch.load(checkpoint_name, map_location='cpu')
|
||||
if args.deepspeed:
|
||||
model = model.module
|
||||
if isinstance(model, TorchDDP):
|
||||
model = model.module
|
||||
if isinstance(model, FP16_Module):
|
||||
model = model.module
|
||||
if hasattr(model, 'model'):
|
||||
model = model.model
|
||||
|
||||
# Model.
|
||||
def extend_embedding_weights(state_weights, model_weights):
|
||||
original_length = state_weights.shape[0]
|
||||
assert original_length <= args.max_position_embeddings + 1
|
||||
new_weights = model_weights.clone()
|
||||
new_weights[:original_length] = state_weights
|
||||
return new_weights
|
||||
|
||||
if args.block_lm:
|
||||
if 'transformer.block_position_embeddings.weight' in sd['module']:
|
||||
position_weights = sd['module'][
|
||||
'transformer.position_embeddings.weight']
|
||||
if args.max_position_embeddings + 1 > position_weights.shape[0]:
|
||||
sd['module'][
|
||||
'transformer.position_embeddings.weight'] = extend_embedding_weights(
|
||||
position_weights,
|
||||
model.state_dict()
|
||||
['transformer.position_embeddings.weight'].data)
|
||||
print_rank_0(
|
||||
f'Extend position embedding to {args.max_position_embeddings + 1}'
|
||||
)
|
||||
if 'transformer.block_position_embeddings.weight' in sd['module']:
|
||||
block_position_weights = sd['module'][
|
||||
'transformer.block_position_embeddings.weight']
|
||||
if args.max_position_embeddings + 1 > block_position_weights.shape[
|
||||
0]:
|
||||
sd['module'][
|
||||
'transformer.block_position_embeddings.weight'] = extend_embedding_weights(
|
||||
block_position_weights,
|
||||
model.state_dict()
|
||||
['transformer.block_position_embeddings.weight'].data)
|
||||
print_rank_0(
|
||||
f'Extend block position embedding to {args.max_position_embeddings + 1}'
|
||||
)
|
||||
for key in list(model.state_dict().keys()):
|
||||
print(key)
|
||||
model.state_dict()[key.replace(
|
||||
'mixins.block_position_embedding.block_position_embeddings.weight',
|
||||
'transformer.block_position_embeddings.weight').replace(
|
||||
'transformer.word_embeddings.weight',
|
||||
'word_embeddings.weight')] = model.state_dict().pop(key)
|
||||
|
||||
missing_keys, unexpected_keys = model.load_state_dict(
|
||||
sd['module'], strict=False)
|
||||
if missing_keys or unexpected_keys:
|
||||
print_rank_0(
|
||||
f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}')
|
||||
if args.continuous_prompt and args.prompt_init:
|
||||
model.prompt_spell.init_embedding(model.word_embeddings.weight.data,
|
||||
task_tokens)
|
||||
|
||||
|
||||
def get_model(args,
|
||||
model_type=None,
|
||||
multi_token=True,
|
||||
num_labels=None,
|
||||
spell_length=None):
|
||||
"""Build the model."""
|
||||
print_rank_0('building GPT2 model ...')
|
||||
if args.pretrained_bert:
|
||||
if model_type == 'multiple_choice':
|
||||
model = BertForMultipleChoice.from_pretrained(
|
||||
args.tokenizer_model_type,
|
||||
cache_dir=args.cache_dir,
|
||||
fp32_layernorm=args.fp32_layernorm,
|
||||
fp32_embedding=args.fp32_embedding,
|
||||
layernorm_epsilon=args.layernorm_epsilon)
|
||||
elif model_type == 'classification':
|
||||
model = BertForSequenceClassification.from_pretrained(
|
||||
args.tokenizer_model_type,
|
||||
cache_dir=args.cache_dir,
|
||||
fp32_layernorm=args.fp32_layernorm,
|
||||
fp32_embedding=args.fp32_embedding,
|
||||
layernorm_epsilon=args.layernorm_epsilon,
|
||||
num_labels=num_labels)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
output_predict, parallel_output = True, True
|
||||
if (model_type == 'multiple_choice'
|
||||
or model_type == 'classification') and not args.cloze_eval:
|
||||
output_predict = False
|
||||
if model_type is not None:
|
||||
parallel_output = False
|
||||
if spell_length is not None:
|
||||
print_rank_0(f'Continuous spell length {spell_length}')
|
||||
model = GLMModel(
|
||||
num_layers=args.num_layers,
|
||||
vocab_size=args.vocab_size,
|
||||
hidden_size=args.hidden_size,
|
||||
num_attention_heads=args.num_attention_heads,
|
||||
embedding_dropout_prob=args.hidden_dropout,
|
||||
attention_dropout_prob=args.attention_dropout,
|
||||
output_dropout_prob=args.hidden_dropout,
|
||||
max_sequence_length=args.max_position_embeddings,
|
||||
max_memory_length=args.mem_length,
|
||||
checkpoint_activations=args.checkpoint_activations,
|
||||
checkpoint_num_layers=args.checkpoint_num_layers,
|
||||
parallel_output=parallel_output,
|
||||
relative_encoding=args.transformer_xl,
|
||||
block_position_encoding=args.block_lm and not args.masked_lm,
|
||||
output_predict=output_predict,
|
||||
spell_length=spell_length,
|
||||
spell_func=args.prompt_func,
|
||||
attention_scale=args.attention_scale)
|
||||
if args.freeze_transformer:
|
||||
model.freeze_transformer(
|
||||
tune_prefix_layers=args.tune_prefix_layers)
|
||||
if model_type is not None:
|
||||
if model_type == 'multiple_choice':
|
||||
if args.cloze_eval:
|
||||
if multi_token:
|
||||
if args.fast_decode:
|
||||
model = GLMForMultiTokenClozeFast(
|
||||
model, length_penalty=args.length_penalty)
|
||||
else:
|
||||
model = GLMForMultiTokenCloze(
|
||||
model, length_penalty=args.length_penalty)
|
||||
else:
|
||||
model = GLMForSingleTokenCloze(
|
||||
model, take_softmax=args.adapet)
|
||||
else:
|
||||
model = GLMForSequenceClassification(
|
||||
model,
|
||||
args.hidden_size,
|
||||
args.output_dropout,
|
||||
args.pool_token,
|
||||
num_class=num_labels)
|
||||
elif model_type == 'classification':
|
||||
model = GLMForSequenceClassification(
|
||||
model,
|
||||
args.hidden_size,
|
||||
args.output_dropout,
|
||||
args.pool_token,
|
||||
num_class=num_labels)
|
||||
elif model_type == 'generation':
|
||||
pass
|
||||
else:
|
||||
raise NotImplementedError(model_type)
|
||||
|
||||
if mpu.get_data_parallel_rank() == 0:
|
||||
print(
|
||||
' > number of parameters on model parallel rank {}: {}'.format(
|
||||
mpu.get_model_parallel_rank(),
|
||||
sum([p.nelement() for p in model.parameters()])),
|
||||
flush=True)
|
||||
|
||||
# To prevent OOM for model sizes that cannot fit in GPU memory in full precision
|
||||
if args.fp16:
|
||||
model.half()
|
||||
|
||||
# GPU allocation.
|
||||
model.cuda(torch.cuda.current_device())
|
||||
|
||||
# Fp16 conversion.
|
||||
if args.fp16:
|
||||
model = FP16_Module(model)
|
||||
|
||||
# Wrap model for distributed training.
|
||||
if not args.deepspeed and (args.train_iters or args.epochs):
|
||||
if args.DDP_impl == 'torch':
|
||||
i = torch.cuda.current_device()
|
||||
model = TorchDDP(
|
||||
model,
|
||||
device_ids=[i],
|
||||
output_device=i,
|
||||
process_group=mpu.get_data_parallel_group())
|
||||
elif args.DDP_impl == 'local':
|
||||
model = LocalDDP(model)
|
||||
else:
|
||||
print_rank_0('Skip DDP model')
|
||||
return model
|
||||
|
||||
|
||||
def get_optimizer_param_groups(model):
|
||||
# Build parameter groups (weight decay and non-decay).
|
||||
while isinstance(model, (LocalDDP, TorchDDP, FP16_Module)):
|
||||
model = model.module
|
||||
param_groups = glm_get_params_for_weight_decay_optimization(model)
|
||||
|
||||
# Add model parallel attribute if it is not set.
|
||||
for param_group in param_groups:
|
||||
# print('## param_group', len(param_group['params']))
|
||||
for param in param_group['params']:
|
||||
if not hasattr(param, 'model_parallel'):
|
||||
param.model_parallel = False
|
||||
|
||||
return param_groups
|
||||
|
||||
|
||||
def get_optimizer(param_groups, args):
|
||||
"""Set up the optimizer."""
|
||||
if args.cpu_optimizer:
|
||||
# Apex FusedAdam uses decoupled weight decay so use the same here
|
||||
if args.cpu_torch_adam:
|
||||
cpu_adam_optimizer = torch.optim.AdamW
|
||||
else:
|
||||
from deepspeed.ops.adam import DeepSpeedCPUAdam
|
||||
cpu_adam_optimizer = DeepSpeedCPUAdam
|
||||
optimizer = cpu_adam_optimizer(
|
||||
param_groups, lr=args.lr, weight_decay=args.weight_decay)
|
||||
else:
|
||||
# Use FusedAdam.
|
||||
if args.optimizer == 'adam':
|
||||
optimizer = Adam(
|
||||
param_groups,
|
||||
lr=args.lr,
|
||||
weight_decay=args.weight_decay,
|
||||
betas=(args.adam_beta1, args.adam_beta2),
|
||||
eps=args.adam_eps)
|
||||
elif args.optimizer == 'adafactor':
|
||||
from transformers import Adafactor
|
||||
optimizer = Adafactor(
|
||||
param_groups,
|
||||
lr=args.lr,
|
||||
relative_step=False,
|
||||
warmup_init=False)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
print(f'Optimizer = {optimizer.__class__.__name__}')
|
||||
if hasattr(args, 'deepspeed') and args.deepspeed:
|
||||
raise NotImplementedError
|
||||
# fp16 wrapper is not required for DeepSpeed.
|
||||
# return optimizer
|
||||
|
||||
# Wrap into fp16 optimizer.
|
||||
if args.fp16:
|
||||
optimizer = FP16_Optimizer(
|
||||
optimizer,
|
||||
static_loss_scale=args.loss_scale,
|
||||
dynamic_loss_scale=args.dynamic_loss_scale,
|
||||
dynamic_loss_args={
|
||||
'scale_window': args.loss_scale_window,
|
||||
'min_scale': args.min_scale,
|
||||
'delayed_shift': args.hysteresis
|
||||
})
|
||||
|
||||
return optimizer
|
||||
|
||||
|
||||
def get_learning_rate_scheduler(optimizer, args):
|
||||
"""Build the learning rate scheduler."""
|
||||
|
||||
# Add linear learning rate scheduler.
|
||||
if args.lr_decay_iters is not None:
|
||||
num_iters = args.lr_decay_iters
|
||||
else:
|
||||
num_iters = args.train_iters
|
||||
if args.finetune:
|
||||
num_iters = num_iters // args.gradient_accumulation_steps
|
||||
num_iters = max(1, num_iters)
|
||||
init_step = -1
|
||||
warmup_iter = args.warmup * num_iters
|
||||
lr_scheduler = AnnealingLR(
|
||||
optimizer,
|
||||
start_lr=args.lr,
|
||||
warmup_iter=warmup_iter,
|
||||
num_iters=num_iters - warmup_iter,
|
||||
decay_style=args.lr_decay_style,
|
||||
last_iter=init_step,
|
||||
decay_ratio=args.lr_decay_ratio)
|
||||
|
||||
return lr_scheduler
|
||||
|
||||
|
||||
def setup_model_and_optimizer(args,
|
||||
model_type=None,
|
||||
multi_token=True,
|
||||
num_labels=None,
|
||||
spell_length=None):
|
||||
"""Setup model and optimizer."""
|
||||
|
||||
model = get_model(
|
||||
args,
|
||||
model_type=model_type,
|
||||
multi_token=multi_token,
|
||||
num_labels=num_labels,
|
||||
spell_length=spell_length)
|
||||
param_groups = get_optimizer_param_groups(model)
|
||||
|
||||
if args.train_data is not None or args.data_dir is not None and (
|
||||
args.epochs > 0 or args.train_iters > 0):
|
||||
if args.deepspeed:
|
||||
print_rank_0('DeepSpeed is enabled.')
|
||||
|
||||
model, optimizer, _, _ = deepspeed.initialize(
|
||||
model=model,
|
||||
model_parameters=param_groups,
|
||||
args=args,
|
||||
mpu=mpu,
|
||||
dist_init_required=False)
|
||||
else:
|
||||
optimizer = get_optimizer(param_groups, args)
|
||||
lr_scheduler = get_learning_rate_scheduler(optimizer, args)
|
||||
else:
|
||||
optimizer, lr_scheduler = None, None
|
||||
|
||||
return model, optimizer, lr_scheduler
|
||||
|
||||
|
||||
def backward_step(optimizer, model, lm_loss, args, timers):
|
||||
"""Backward step."""
|
||||
|
||||
# Total loss.
|
||||
loss = lm_loss
|
||||
|
||||
# Backward pass.
|
||||
if args.deepspeed:
|
||||
model.backward(loss)
|
||||
else:
|
||||
# optimizer.zero_grad()
|
||||
if args.fp16:
|
||||
optimizer.backward(loss, update_master_grads=False)
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
if args.deepspeed or args.DDP_impl == 'torch':
|
||||
        # DeepSpeed's backward pass already handles the all-reduce communication.
        # Reset the timer to avoid breaking timer logs below.
        timers('allreduce').reset()
    else:
        timers('allreduce').start()
        model.allreduce_params(
            reduce_after=False, fp32_allreduce=args.fp32_allreduce)
        timers('allreduce').stop()

    # Update master gradients.
    if not args.deepspeed:
        if args.fp16:
            optimizer.update_master_grads()

        # Clipping gradients helps prevent the exploding gradient.
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)

    return lm_loss


def see_memory_usage(message, force=False):
    if not force:
        return
    dist.barrier()
    if dist.get_rank() == 0:
        print(message)
        print('Memory Allocated ',
              torch.cuda.memory_allocated() / (1024 * 1024 * 1024),
              'GigaBytes')
        print('Max Memory Allocated ',
              torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),
              'GigaBytes')
        print('Cache Allocated ',
              torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes')
        print('Max cache Allocated ',
              torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
              'GigaBytes')
        print(' ')
        # input("Press Any Key To Continue ..")


def train_step(data_iterator,
               model,
               optimizer,
               lr_scheduler,
               args,
               timers,
               forward_step_func,
               mems=None,
               single_step=False):
    """Single training step."""
    lm_loss_total, count = 0.0, 0
    mems = [] if mems is None else mems
    if not args.deepspeed:
        optimizer.zero_grad()
    while True:
        skipped_iter, complete = 0, False
        # Forward model for one step.
        timers('forward').start()
        lm_loss, mems, _ = forward_step_func(data_iterator, model, args,
                                             timers, mems)
        timers('forward').stop()
        # print_rank_0("Forward step")
        if not args.deepspeed:
            lm_loss /= args.gradient_accumulation_steps

        reduced_loss = lm_loss.detach().clone().view(1)
        torch.distributed.all_reduce(
            reduced_loss.data, group=mpu.get_data_parallel_group())
        reduced_loss.data = reduced_loss.data / (
            args.world_size / args.model_parallel_size)

        if not DynamicLossScaler._has_inf_or_nan(reduced_loss):
            lm_loss_total += reduced_loss
            count += 1

            # Calculate gradients, reduce across processes, and clip.
            timers('backward').start()
            backward_step(optimizer, model, lm_loss, args, timers)
            timers('backward').stop()
            # print_rank_0("Backward step")
            # Update parameters.
            timers('optimizer').start()
            if args.deepspeed:
                if model.is_gradient_accumulation_boundary():
                    model.step()
                    complete = True
                    if not (args.fp16 and optimizer.overflow):
                        lr_scheduler.step()
                    else:
                        skipped_iter = 1
                else:
                    model.step()
            else:
                if count == args.gradient_accumulation_steps:
                    optimizer.step()
                    complete = True
                    # Update learning rate.
                    if not (args.fp16 and optimizer.overflow):
                        lr_scheduler.step()
                    else:
                        skipped_iter = 1
            # print_rank_0("Optimizer step")
            timers('optimizer').stop()
            if complete:
                break
        else:
            print_rank_0('Found NaN loss, skip backward')
            del lm_loss, reduced_loss
            mems = []
        if single_step:
            break
    if args.deepspeed:
        lm_loss_total = lm_loss_total / count
    return lm_loss_total, skipped_iter, mems
529 modelscope/models/nlp/mglm/utils.py Normal file
@@ -0,0 +1,529 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for logging and serialization"""

import os
import random
import subprocess
import time

import json
import numpy as np
import torch

from . import mpu
from .fp16 import FP16_Optimizer

SUMMARY_WRITER_DIR_NAME = 'runs'


def get_log_dir(name, base):
    return os.path.join(base, SUMMARY_WRITER_DIR_NAME, name)


def print_rank_0(message):
    if torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            print(message, flush=True)
    else:
        print(message, flush=True)


def get_hostname():
    hostname_cmd = ['hostname -I']
    result = subprocess.check_output(hostname_cmd, shell=True)
    master_addr = result.decode('utf-8').split()[0]
    return master_addr


def get_spare_port(args):
    if torch.distributed.get_rank() == 0:
        port = subprocess.check_output(['shuf -n 1 -i 10000-65535'],
                                       shell=True)
        port = int(port.strip())
        if port == args.master_port:
            port = subprocess.check_output(['shuf -n 1 -i 10000-65535'],
                                           shell=True)
            port = int(port.strip())
        port = torch.cuda.LongTensor([port])
    else:
        port = torch.cuda.LongTensor([0])
    torch.distributed.broadcast(port, 0)
    port = port.item()
    return port


def print_and_save_args(args, verbose=True, log_dir=None):
    """Print arguments."""
    if verbose:
        print('arguments:', flush=True)
        for arg in vars(args):
            dots = '.' * (29 - len(arg))
            print(
                ' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)
    if log_dir is not None:
        json_file = os.path.join(log_dir, 'config.json')
        with open(json_file, 'w') as output:
            json.dump(vars(args), output, sort_keys=True)
        if args.deepspeed and args.deepspeed_config is not None:
            with open(args.deepspeed_config) as file:
                deepspeed_config = json.load(file)
            deepspeed_json_file = os.path.join(log_dir,
                                               'config_gpt_large.json')
            with open(deepspeed_json_file, 'w') as output:
                json.dump(deepspeed_config, output)


def print_params_min_max_norm(optimizer, iteration):
    """Print min, max, and norm of all parameters."""
    index = 0
    rank = torch.distributed.get_rank()
    string = 'iteration, rank, index, model-parallel, min, max, norm\n'
    optimizer_ = optimizer
    if isinstance(optimizer, FP16_Optimizer):
        optimizer_ = optimizer.optimizer
    for param_group in optimizer_.param_groups:
        for param in param_group['params']:
            index += 1
            min_ = param.data.min()
            max_ = param.data.max()
            norm = param.data.norm()
            string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format(
                iteration, rank, index, int(param.model_parallel))
            string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm)
    print(string, flush=True)


class Timers:
    """Group of timers."""

    class Timer:
        """Timer."""

        def __init__(self, name):
            self.name_ = name
            self.elapsed_ = 0.0
            self.started_ = False
            self.start_time = time.time()

        def start(self):
            """Start the timer."""
            assert not self.started_, 'timer has already been started'
            torch.cuda.synchronize()
            self.start_time = time.time()
            self.started_ = True

        def stop(self):
            """Stop the timer."""
            assert self.started_, 'timer is not started'
            torch.cuda.synchronize()
            self.elapsed_ += (time.time() - self.start_time)
            self.started_ = False

        def reset(self):
            """Reset timer."""
            self.elapsed_ = 0.0
            self.started_ = False

        def elapsed(self, reset=True):
            """Calculate the elapsed time."""
            started_ = self.started_
            # If timing is in progress, stop it first.
            if self.started_:
                self.stop()
            # Get the elapsed time.
            elapsed_ = self.elapsed_
            # Reset the elapsed time
            if reset:
                self.reset()
            # If timing was in progress, set it back.
            if started_:
                self.start()
            return elapsed_

    def __init__(self):
        self.timers = {}

    def __call__(self, name):
        if name not in self.timers:
            self.timers[name] = self.Timer(name)
        return self.timers[name]

    def log(self, names, normalizer=1.0, reset=True):
        """Log a group of timers."""
        assert normalizer > 0.0
        string = 'time (ms)'
        for name in names:
            elapsed_time = self.timers[name].elapsed(
                reset=reset) * 1000.0 / normalizer
            string += ' | {}: {:.2f}'.format(name, elapsed_time)
        print_rank_0(string)


def report_memory(name):
    """Simple GPU memory report."""

    mega_bytes = 1024.0 * 1024.0
    string = name + ' memory (MB)'
    string += ' | allocated: {}'.format(torch.cuda.memory_allocated()
                                        / mega_bytes)
    string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated()
                                            / mega_bytes)
    string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes)
    string += ' | max cached: {}'.format(torch.cuda.memory_reserved()
                                         / mega_bytes)
    print_rank_0(string)


def get_checkpoint_name(checkpoints_path,
                        iteration,
                        release=False,
                        zero=False):
    if release:
        d = 'release'
    else:
        d = '{}'.format(iteration)
    if zero:
        dp_rank = mpu.get_data_parallel_rank()
        d += '_zero_dp_rank_{}'.format(dp_rank)
    return os.path.join(
        checkpoints_path, d,
        'mp_rank_{:02d}_model_states.pt'.format(mpu.get_model_parallel_rank()))


def ensure_directory_exists(filename):
    dirname = os.path.dirname(filename)
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)


def get_checkpoint_tracker_filename(checkpoints_path):
    return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt')


def save_zero_checkpoint(args, iteration, optimizer):
    zero_sd = {
        'iteration': iteration,
        'optimizer_state_dict': optimizer.state_dict()
    }
    zero_checkpoint_name = get_checkpoint_name(args.save, iteration, zero=True)
    ensure_directory_exists(zero_checkpoint_name)
    torch.save(zero_sd, zero_checkpoint_name)
    print(' successfully saved {}'.format(zero_checkpoint_name))


def save_checkpoint(iteration,
                    model,
                    optimizer,
                    lr_scheduler,
                    args,
                    tag=None,
                    barrier=True,
                    only_changed_parameters=False,
                    no_deepspeed=False,
                    no_save_optim=False):
    """Save a model checkpoint."""
    if tag is None:
        tag = str(iteration)
    if args.deepspeed and not no_deepspeed:
        save_ds_checkpoint(iteration, model, lr_scheduler, args, tag=tag)
    else:
        # Only rank zero of the data parallel group writes to the disk.

        if mpu.get_data_parallel_rank() == 0:
            checkpoint_name = get_checkpoint_name(args.save, tag)
            print(
                'global rank {} is saving checkpoint at iteration {:7d} to {}'.
                format(torch.distributed.get_rank(), iteration,
                       checkpoint_name))
            sd = {'iteration': iteration}
            if args.deepspeed:
                model = model.module
            state_dict = model.state_dict()
            if only_changed_parameters:
                requires_grad_dict = {}
                for name, parameter in model.named_parameters():
                    requires_grad_dict[name] = parameter.requires_grad
                state_dict = {
                    key: value
                    for key, value in state_dict.items()
                    if requires_grad_dict[key]
                }
            sd['module'] = state_dict

            # Optimizer stuff.
            if not args.no_save_optim and not no_save_optim:
                if optimizer is not None:
                    sd['optimizer'] = optimizer.state_dict()
                if lr_scheduler is not None:
                    sd['lr_scheduler'] = lr_scheduler.state_dict()

            # rng states.
            if not args.no_save_rng:
                sd['random_rng_state'] = random.getstate()
                sd['np_rng_state'] = np.random.get_state()
                sd['torch_rng_state'] = torch.get_rng_state()
                sd['cuda_rng_state'] = torch.cuda.get_rng_state()
                sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker(
                ).get_states()

            ensure_directory_exists(checkpoint_name)
            torch.save(sd, checkpoint_name)
            print(' successfully saved {}'.format(checkpoint_name))

    # Wait so everyone is done (necessary)
    if barrier:
        torch.distributed.barrier()
    # And update the latest iteration
    if torch.distributed.get_rank() == 0:
        tracker_filename = get_checkpoint_tracker_filename(args.save)
        with open(tracker_filename, 'w') as f:
            f.write(tag)


def save_ds_checkpoint(iteration, model, lr_scheduler, args, tag):
    """Save a model checkpoint."""

    sd = {}
    sd['iteration'] = iteration
    if lr_scheduler is not None:
        sd['client_lr_scheduler'] = lr_scheduler.state_dict()
    # rng states.
    if not args.no_save_rng:
        sd['random_rng_state'] = random.getstate()
        sd['np_rng_state'] = np.random.get_state()
        sd['torch_rng_state'] = torch.get_rng_state()
        sd['cuda_rng_state'] = torch.cuda.get_rng_state()
        sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()
    model.save_checkpoint(args.save, tag, client_state=sd)


def get_checkpoint_iteration(load_path):
    # Read the tracker file and set the iteration.
    tracker_filename = get_checkpoint_tracker_filename(load_path)
    if not os.path.isfile(tracker_filename):
        print_rank_0('WARNING: could not find the metadata file {} '.format(
            tracker_filename))
        if os.path.isdir(load_path):
            path = os.path.normpath(load_path)
            load_dir, tag = os.path.split(path)
            print_rank_0(
                'Try to directly load the checkpoint from the directory')
            return load_dir, tag, False, True
        print_rank_0(' will not load any checkpoints and will start from '
                     'random')
        return load_path, 0, False, False
    with open(tracker_filename, 'r') as f:
        metastring = f.read().strip()
        release = metastring == 'release'
    # try:
    #     iteration = int(metastring)
    # except ValueError:
    #     release = metastring == 'release'
    #     if not release:
    #         print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
    #             tracker_filename))
    #         exit()

    # assert iteration > 0 or release, 'error parsing metadata file {}'.format(
    #     tracker_filename)

    return load_path, metastring, release, True


def load_checkpoint(model,
                    optimizer,
                    lr_scheduler,
                    args,
                    no_deepspeed=False,
                    no_load_optim=False):
    """Load a model checkpoint."""

    load_dir, tag, release, success = get_checkpoint_iteration(args.load)

    if not success:
        return 0

    if args.deepspeed and not no_deepspeed:

        checkpoint_name, sd = model.load_checkpoint(
            load_dir,
            tag,
            load_optimizer_states=not args.no_load_optim and not no_load_optim,
            load_lr_scheduler_states=not args.no_load_lr_scheduler)
        if not args.no_load_lr_scheduler and 'client_lr_scheduler' in sd:
            lr_scheduler.load_state_dict(sd['client_lr_scheduler'])
            print_rank_0('Load lr scheduler state')
        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print('Unable to load checkpoint.')
            return tag

    else:

        # Checkpoint.
        checkpoint_name = get_checkpoint_name(load_dir, tag, release)

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        sd = torch.load(checkpoint_name, map_location='cpu')

        # Model.
        if args.deepspeed:
            model = model.module
        missing_keys, unexpected_keys = model.load_state_dict(
            sd['module'], strict=False)
        if missing_keys or unexpected_keys:
            print_rank_0(
                f'Missing keys {missing_keys}, unexpected keys {unexpected_keys}'
            )

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim and not no_load_optim:
            try:
                if optimizer is not None:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0(
                    'A metadata file exists but unable to load iteration '
                    'from checkpoint {}, starting from 0 iteration'.format(
                        checkpoint_name))
                iteration = 0

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0(
                'Unable to load random state from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the random '
                'state.'.format(checkpoint_name))

    if mpu.get_data_parallel_rank() == 0:
        print(' successfully loaded {}'.format(checkpoint_name))

    return iteration


def load_weights(src, dst, dst2src=False):
    """
    Loads weights from src to dst via in place copy.
    src is a huggingface gpt2model, while dst is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src is still untested
    """
    conv_layer = 'Conv1D' in str(type(src))
    for n, p in src.named_parameters():
        if dst2src:
            data = dst._parameters[n].data
            load = p.data
        else:
            data = p.data
            load = dst._parameters[n].data
        if conv_layer and 'weight' in n:
            data = data.t().contiguous()
        load.copy_(data)


# dst._parameters[n].data.copy_(data)


def load_mlp(our, oai, dst2src=False):
    load_weights(oai.c_fc, our.dense_h_to_4h, dst2src)
    load_weights(oai.c_proj, our.dense_4h_to_h, dst2src)


def load_attention(our, oai, dst2src=False):
    load_weights(oai.c_attn, our.query_key_value, dst2src)
    load_weights(oai.c_proj, our.dense, dst2src)


def load_transformer_layer(our, oai, dst2src=False):
    load_weights(oai.ln_1, our.input_layernorm, dst2src)
    load_weights(oai.ln_2, our.post_attention_layernorm, dst2src)
    load_mlp(our.mlp, oai.mlp, dst2src)
    load_attention(our.attention, oai.attn, dst2src)


def move_weights(our, oai, dst2src=False):
    """
    Loads weights from `oai` to `our` via in place copy.
    `oai` is a huggingface gpt2model, while `our` is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src=True is still untested
    """
    # while isinstance(our, (torchDDP, model.distributed.DistributedDataParallel, FP16_Module)):
    #     our=our.module
    transformer_model = oai.transformer
    load_weights(transformer_model.ln_f, our.transformer.final_layernorm,
                 dst2src)
    load_weights(transformer_model.wte, our.word_embeddings, dst2src)
    load_weights(transformer_model.wpe, our.position_embeddings, dst2src)

    for our_layer, oai_layer in zip(our.transformer.layers, oai.transformer.h):
        load_transformer_layer(our_layer, oai_layer, dst2src)


def debug_finetune_data(local_vars, batch_id, tokenizer):
    tokens, target_ids = local_vars['tokens'], local_vars['target_ids']
    attention_mask, logit_mask, position_ids = local_vars[
        'attention_mask'], local_vars['logit_mask'], local_vars['position_ids']
    output_tokens = []
    sep = attention_mask[batch_id].item()
    for i, token in enumerate(tokens[batch_id][:sep].tolist()):
        token = tokenizer.IdToToken(token)
        if token == '[MASK]':
            token = f'[{position_ids[batch_id][0, i].item()}]'
        output_tokens.append(token)
    print(' '.join(output_tokens))
    target_positions = []
    for i in range(sep, tokens.size(-1)):
        if logit_mask[batch_id][i]:
            target_positions.append(i)
    print(target_positions)
    print(tokenizer.DecodeIds(tokens[batch_id][target_positions].tolist()))
    if len(target_ids.shape) > 2:
        print(
            tokenizer.DecodeIds(
                target_ids[batch_id][target_positions].tolist()))
    else:
        print(tokenizer.DecodeIds(target_ids[batch_id].tolist()))
    print(position_ids[batch_id][:, target_positions])
@@ -516,6 +516,12 @@ TASK_OUTPUTS = {
    # }
    Tasks.text_generation: [OutputKeys.TEXT],

    # summarization result for single sample
    # {
    #     "text": "this is the text generated by a model."
    # }
    Tasks.text_summarization: [OutputKeys.TEXT],

    # text generation result for single sample
    # {
    #     "text": "北京"
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
    from .translation_pipeline import TranslationPipeline
    from .word_segmentation_pipeline import WordSegmentationPipeline
    from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
    from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline
    from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \
        WordSegmentationThaiPipeline

@@ -71,6 +72,7 @@ else:
        'word_segmentation_pipeline': ['WordSegmentationPipeline'],
        'zero_shot_classification_pipeline':
        ['ZeroShotClassificationPipeline'],
        'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'],
        'multilingual_word_segmentation_pipeline': [
            'MultilingualWordSegmentationPipeline',
            'WordSegmentationThaiPipeline'
43 modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py Normal file
@@ -0,0 +1,43 @@
# Copyright (c) 2022 Zhipu.AI

from typing import Any, Dict, Optional, Union

from modelscope.metainfo import Pipelines
from modelscope.models.base import Model
from modelscope.models.nlp import MGLMForTextSummarization
from modelscope.pipelines.base import Pipeline, Tensor
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import (MGLMSummarizationPreprocessor,
                                      Preprocessor)
from modelscope.utils.constant import Tasks

__all__ = ['MGLMTextSummarizationPipeline']


@PIPELINES.register_module(
    group_key=Tasks.text_summarization,
    module_name=Pipelines.mglm_text_summarization)
class MGLMTextSummarizationPipeline(Pipeline):

    def __init__(self,
                 model: Union[MGLMForTextSummarization, str],
                 preprocessor: Optional[Preprocessor] = None,
                 *args,
                 **kwargs):
        model = MGLMForTextSummarization(model) if isinstance(model,
                                                              str) else model
        self.model = model
        self.model.eval()
        if preprocessor is None:
            preprocessor = MGLMSummarizationPreprocessor()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)

    # define the forward pass
    def forward(self, inputs: Union[Dict, str],
                **forward_params) -> Dict[str, Any]:
        inputs = {'text': inputs} if isinstance(inputs, str) else inputs
        return self.model.generate(inputs)

    # format the outputs from pipeline
    def postprocess(self, input, **kwargs) -> Dict[str, Any]:
        return input
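A minimal usage sketch for the registered pipeline, assuming the hub model id used in the unit test added by this commit ('ZhipuAI/Multilingual-GLM-Summarization-zh') is available:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Resolving the model id builds MGLMTextSummarizationPipeline via the registry above.
summarizer = pipeline(
    task=Tasks.text_summarization,
    model='ZhipuAI/Multilingual-GLM-Summarization-zh')
# A plain string is wrapped into {'text': ...} inside forward().
print(summarizer('A long document to be summarized ...'))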
@@ -18,16 +18,16 @@ if TYPE_CHECKING:
    from .nlp import (
        DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor,
        FillMaskPoNetPreprocessor, NLPPreprocessor,
        NLPTokenizerPreprocessorBase, TextRankingPreprocessor,
        RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor,
        SequenceClassificationPreprocessor, TokenClassificationPreprocessor,
        TextErrorCorrectionPreprocessor, TextGenerationPreprocessor,
        Text2TextGenerationPreprocessor, Tokenize,
        NLPTokenizerPreprocessorBase, PassageRankingPreprocessor,
        TextRankingPreprocessor, RelationExtractionPreprocessor,
        SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor,
        TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor,
        TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize,
        WordSegmentationBlankSetToLabelPreprocessor,
        ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor,
        SentencePiecePreprocessor, DialogIntentPredictionPreprocessor,
        DialogModelingPreprocessor, DialogStateTrackingPreprocessor,
        ConversationalTextToSqlPreprocessor,
        MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor,
        TextGenerationJiebaPreprocessor, SentencePiecePreprocessor,
        DialogIntentPredictionPreprocessor, DialogModelingPreprocessor,
        DialogStateTrackingPreprocessor, ConversationalTextToSqlPreprocessor,
        TableQuestionAnsweringPreprocessor, NERPreprocessorViet,
        NERPreprocessorThai, WordSegmentationPreprocessorThai)
    from .video import ReadVideoData, MovieSceneSegmentationPreprocessor
@@ -57,6 +57,7 @@ else:
        'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor',
        'Tokenize', 'Text2TextGenerationPreprocessor',
        'WordSegmentationBlankSetToLabelPreprocessor',
        'MGLMSummarizationPreprocessor',
        'ZeroShotClassificationPreprocessor',
        'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor',
        'NERPreprocessorViet', 'NERPreprocessorThai',

@@ -29,6 +29,7 @@ if TYPE_CHECKING:
        MultiWOZBPETextField, IntentBPETextField)
    from .space_T_en import ConversationalTextToSqlPreprocessor
    from .space_T_cn import TableQuestionAnsweringPreprocessor
    from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor
else:
    _import_structure = {
        'nlp_base': [
@@ -62,6 +63,7 @@ else:
        'text_error_correction': [
            'TextErrorCorrectionPreprocessor',
        ],
        'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'],
        'token_classification_thai_preprocessor': [
            'NERPreprocessorThai',
            'WordSegmentationPreprocessorThai',
@@ -0,0 +1,32 @@
# Copyright (c) 2022 Zhipu.AI

import os.path as osp
import re
from typing import Any, Dict, Iterable, Optional, Tuple, Union

from modelscope.metainfo import Models, Preprocessors
from modelscope.outputs import OutputKeys
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import PREPROCESSORS
from modelscope.utils.config import Config, ConfigFields
from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
from modelscope.utils.hub import get_model_type, parse_label_mapping
from modelscope.utils.logger import get_logger
from modelscope.utils.nlp import import_external_nltk_data
from modelscope.utils.type_assert import type_assert


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.mglm_summarization)
class MGLMSummarizationPreprocessor(Preprocessor):

    def __init__(self, *args, **kwargs):
        """preprocess the data

        Args:
            model_dir (str): model path
        """
        super().__init__(*args, **kwargs)

    @type_assert(object, (str, tuple, Dict))
    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
        return data
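Since __call__ returns its argument unchanged, the preprocessor is effectively a pass-through and the pipeline hands raw text (or a dict) straight to the model. A quick sketch, assuming the default Preprocessor base class does not alter the call:

from modelscope.preprocessors import MGLMSummarizationPreprocessor

preprocessor = MGLMSummarizationPreprocessor()
# The input comes back untouched.
assert preprocessor('some text to summarize') == 'some text to summarize'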
@@ -1,18 +1,25 @@
boto3
en_core_web_sm>=2.3.5
fasttext
filelock
ftfy
jieba>=0.42.1
megatron_util
matplotlib
nltk
pai-easynlp
pandas
# protobuf versions beyond 3.20.0 are not compatible with TensorFlow 1.x and are therefore discouraged.
protobuf>=3.19.0,<3.21.0
pythainlp
pyvi
# rouge_score was recently updated from 0.0.4 to 0.0.7,
# which introduced compatibility issues that are being investigated
rouge_score<=0.0.4
regex
sacremoses>=0.0.41
scikit_learn
sentencepiece
seqeval
spacy>=2.3.5
subword_nmt>=0.3.8
termcolor
text2sql_lgesql
tokenizers
transformers>=4.12.0
47 tests/pipelines/test_mglm_text_summarization.py Normal file
@@ -0,0 +1,47 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest

from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.preprocessors import MGLMSummarizationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.test_utils import test_level


class mGLMTest(unittest.TestCase, DemoCompatibilityCheck):

    def setUp(self) -> None:
        self.output_dir = 'unittest_output'
        os.makedirs(self.output_dir, exist_ok=True)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_mglm_with_name(self):
        model = 'ZhipuAI/Multilingual-GLM-Summarization-zh'
        preprocessor = MGLMSummarizationPreprocessor()
        pipe = pipeline(
            task=Tasks.text_summarization,
            model=model,
            preprocessor=preprocessor,
        )
        result = pipe(
            '据中国载人航天工程办公室消息,北京时间2022年10月25日,梦天实验舱与长征五号B遥四运载火箭组合体已转运至发射区。后续将按计划开展发射前各项功能检查和联合测试等工作,计划于近日择机实施发射。目前,文昌航天发射场设施设备状态良好,参试各单位正在加紧开展任务准备,全力以赴确保空间站建造任务决战决胜。'  # noqa
        )
        print(result)

        model = 'ZhipuAI/Multilingual-GLM-Summarization-en'
        preprocessor = MGLMSummarizationPreprocessor()
        pipe = pipeline(
            task=Tasks.text_summarization,
            model=model,
            preprocessor=preprocessor,
        )
        result = pipe(
            '据中国载人航天工程办公室消息,北京时间2022年10月25日,梦天实验舱与长征五号B遥四运载火箭组合体已转运至发射区。后续将按计划开展发射前各项功能检查和联合测试等工作,计划于近日择机实施发射。目前,文昌航天发射场设施设备状态良好,参试各单位正在加紧开展任务准备,全力以赴确保空间站建造任务决战决胜。'  # noqa
        )
        print(result)


if __name__ == '__main__':
    unittest.main()