diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index cbab0e0b..4fd985d0 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -32,6 +32,7 @@ class Models(object):
     tcrf = 'transformer-crf'
     bart = 'bart'
    gpt3 = 'gpt3'
+    plug = 'plug'
 
     # audio models
     sambert_hifigan = 'sambert-hifigan'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 24e65ef1..d2b05884 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -20,6 +20,7 @@ if TYPE_CHECKING:
     from .task_models.task_model import SingleBackboneTaskModelBase
     from .bart_for_text_error_correction import BartForTextErrorCorrection
     from .gpt3 import GPT3ForTextGeneration
+    from .plug import PlugForTextGeneration
 else:
     _import_structure = {
@@ -42,6 +43,7 @@ else:
         'task_model': ['SingleBackboneTaskModelBase'],
         'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
         'gpt3': ['GPT3ForTextGeneration'],
+        'plug': ['PlugForTextGeneration'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py
new file mode 100644
index 00000000..b74258a4
--- /dev/null
+++ b/modelscope/models/nlp/plug/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .configuration_plug import PlugNLGConfig
+    from .modeling_plug import PlugModel
+    from .distributed_plug import DistributedPlug
+    from .plug_for_text_generation import PlugForTextGeneration
+else:
+    _import_structure = {
+        'configuration_plug': ['PlugNLGConfig'],
+        'modeling_plug': ['PlugModel'],
+        'distributed_plug': ['DistributedPlug'],
+        'plug_for_text_generation': ['PlugForTextGeneration'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
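The package wires its submodules through `LazyImportModule`, so the heavyweight dependencies (torch, deepspeed) are only loaded when a symbol is first accessed. A minimal sketch of what this enables, assuming the `plug_for_text_generation` module added elsewhere in this PR:

    # Resolved lazily on first access rather than at package import time:
    from modelscope.models.nlp.plug import PlugForTextGeneration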
diff --git a/modelscope/models/nlp/plug/arguments.py b/modelscope/models/nlp/plug/arguments.py
new file mode 100755
index 00000000..e3a0c152
--- /dev/null
+++ b/modelscope/models/nlp/plug/arguments.py
@@ -0,0 +1,414 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""argparser configuration"""
+
+import argparse
+import os
+
+import deepspeed
+import torch
+
+
+def add_model_config_args(parser):
+    """Model arguments."""
+
+    group = parser.add_argument_group('model', 'model configuration')
+
+    group.add_argument('--pretrained-bert', action='store_true',
+                       help='use a pretrained bert-large-uncased model instead '
+                            'of initializing from scratch. See '
+                            '--tokenizer-model-type to specify which pretrained '
+                            'BERT model to use')
+    group.add_argument('--attention-dropout', type=float, default=0.1,
+                       help='dropout probability for attention weights')
+    group.add_argument('--num-attention-heads', type=int, default=16,
+                       help='num of transformer attention heads')
+    group.add_argument('--hidden-size', type=int, default=1024,
+                       help='transformer hidden size')
+    group.add_argument('--intermediate-size', type=int, default=None,
+                       help='transformer embedding dimension for FFN; '
+                            'set to 4*`--hidden-size` if it is None')
+    group.add_argument('--num-layers', type=int, default=24,
+                       help='num decoder layers')
+    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
+                       help='layer norm epsilon')
+    group.add_argument('--hidden-dropout', type=float, default=0.1,
+                       help='dropout probability for hidden state transformer')
+    group.add_argument('--max-position-embeddings', type=int, default=512,
+                       help='maximum number of position embeddings to use')
+    group.add_argument('--vocab-size', type=int, default=30522,
+                       help='vocab size to use for non-character-level '
+                            'tokenization. This value will only be used when '
+                            'creating a tokenizer')
+    group.add_argument('--deep-init', action='store_true',
+                       help='initialize bert model similar to gpt2 model. '
+                            'Scales initialization of projection layers by a '
+                            'factor of 1/sqrt(2N). Necessary to train bert '
+                            'models larger than BERT-Large.')
+    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
+                       help='Pad the vocab size to be divisible by this value. '
+                            'This is added for computational efficiency reasons.')
+    group.add_argument('--cpu-optimizer', action='store_true',
+                       help='Run optimizer on CPU')
+    group.add_argument('--cpu_torch_adam', action='store_true',
+                       help='Use Torch Adam as optimizer on CPU.')
+
+    return parser
+
+
+def add_fp16_config_args(parser):
+    """Mixed precision arguments."""
+
+    group = parser.add_argument_group('fp16', 'fp16 configurations')
+
+    group.add_argument('--fp16', action='store_true',
+                       help='Run model in fp16 mode')
+    group.add_argument('--fp32-embedding', action='store_true',
+                       help='embedding in fp32')
+    group.add_argument('--fp32-layernorm', action='store_true',
+                       help='layer norm in fp32')
+    group.add_argument('--fp32-tokentypes', action='store_true',
+                       help='embedding token types in fp32')
+    group.add_argument('--fp32-allreduce', action='store_true',
+                       help='all-reduce in fp32')
+    group.add_argument('--hysteresis', type=int, default=2,
+                       help='hysteresis for dynamic loss scaling')
+    group.add_argument('--loss-scale', type=float, default=None,
+                       help='Static loss scaling; positive power-of-2 '
+                            'values can improve fp16 convergence. If None, '
+                            'dynamic loss scaling is used.')
+    group.add_argument('--loss-scale-window', type=float, default=1000,
+                       help='Window over which to raise/lower dynamic scale')
+    group.add_argument('--min-scale', type=float, default=1,
+                       help='Minimum loss scale for dynamic loss scale')
+
+    return parser
+
+
+def add_training_args(parser):
+    """Training arguments."""
+
+    group = parser.add_argument_group('train', 'training configurations')
+
+    group.add_argument('--batch-size', type=int, default=4,
+                       help='Data Loader batch size')
+    group.add_argument('--weight-decay', type=float, default=0.01,
+                       help='weight decay coefficient for L2 regularization')
+    group.add_argument('--checkpoint-activations', action='store_true',
+                       help='checkpoint activations to allow for training '
+                            'with larger models and sequences')
+    group.add_argument('--checkpoint-num-layers', type=int, default=1,
+                       help='chunk size (number of layers) for checkpointing')
+    group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
+                       help='uses activation checkpointing from deepspeed')
+    group.add_argument('--clip-grad', type=float, default=1.0,
+                       help='gradient clipping')
+    group.add_argument('--train-iters', type=int, default=1000000,
+                       help='total number of iterations to train over all '
+                            'training runs')
+    group.add_argument('--log-interval', type=int, default=100,
+                       help='report interval')
+    group.add_argument('--exit-interval', type=int, default=None,
+                       help='Exit the program after this many new iterations.')
+    group.add_argument('--seed', type=int, default=1234,
+                       help='random seed')
+    # Batch producer arguments.
+    group.add_argument('--reset-position-ids', action='store_true',
+                       help='Reset position ids after end-of-document token.')
+    group.add_argument('--reset-attention-mask', action='store_true',
+                       help='Reset self attention mask after '
+                            'end-of-document token.')
+
+    # Learning rate.
+    group.add_argument('--lr-decay-iters', type=int, default=None,
+                       help='number of iterations to decay LR over; '
+                            'if None defaults to `--train-iters`*`--epochs`')
+    group.add_argument('--lr-decay-style', type=str, default='linear',
+                       choices=['constant', 'linear', 'cosine', 'exponential'],
+                       help='learning rate decay function')
+    group.add_argument('--lr', type=float, default=1.0e-4,
+                       help='initial learning rate')
+    group.add_argument('--warmup', type=float, default=0.01,
+                       help='percentage of data to warmup on (.01 = 1% of all '
+                            'training iters). Default 0.01')
+    group.add_argument('--batch-warmup', type=float, default=0.01,
+                       help='percentage of data to warmup on (.01 = 1% of all '
+                            'training iters). Default 0.01')
+    group.add_argument('--length-warmup', type=float, default=0.01,
+                       help='percentage of data to warmup on (.01 = 1% of all '
+                            'training iters). Default 0.01')
+    # Model checkpointing.
+    group.add_argument('--save', type=str, default=None,
+                       help='Output directory to save checkpoints to.')
+    group.add_argument('--save-interval', type=int, default=2000,
+                       help='number of iterations between saves')
+    group.add_argument('--no-save-optim', action='store_true',
+                       help='Do not save current optimizer.')
+    group.add_argument('--no-save-rng', action='store_true',
+                       help='Do not save current rng state.')
+    group.add_argument('--load', type=str, default=None,
+                       help='Path to a directory containing a model checkpoint.')
+    group.add_argument('--load-iteration', type=str, default=0,
+                       help='Load iteration of a model checkpoint.')
+    group.add_argument('--pre-load', action='store_true',
+                       help='Use pre-load instead of deepspeed load.')
+    group.add_argument('--no-load-optim', action='store_true',
+                       help='Do not load optimizer when loading checkpoint.')
+    group.add_argument('--no-load-rng', action='store_true',
+                       help='Do not load rng state when loading checkpoint.')
+    group.add_argument('--no-load-lr', action='store_true',
+                       help='Do not load lr schedule when loading checkpoint.')
+    group.add_argument('--finetune', action='store_true',
+                       help='Load model for finetuning. Do not load optimizer '
+                            'or rng state from checkpoint and set iteration to 0. '
+                            'Assumed when loading a release checkpoint.')
+    group.add_argument('--resume-dataloader', action='store_true',
+                       help='Resume the dataloader when resuming training. '
+                            'Does not apply to tfrecords dataloader; try '
+                            'resuming with a different seed in this case.')
+    # Distributed training args.
+    group.add_argument('--distributed-backend', default='nccl',
+                       help='which backend to use for distributed '
+                            'training. One of [gloo, nccl]')
+    group.add_argument('--local_rank', type=int, default=None,
+                       help='local rank passed from distributed launcher')
+
+    return parser
+
+
+def add_evaluation_args(parser):
+    """Evaluation arguments."""
+
+    group = parser.add_argument_group('validation', 'validation configurations')
+
+    group.add_argument('--eval-batch-size', type=int, default=None,
+                       help='Data Loader batch size for evaluation datasets. '
+                            'Defaults to `--batch-size`')
+    group.add_argument('--eval-iters', type=int, default=100,
+                       help='number of iterations to run for validation/test '
+                            'evaluation')
+    group.add_argument('--eval-interval', type=int, default=1000,
+                       help='interval between running evaluation on validation set')
+    group.add_argument('--eval-seq-length', type=int, default=None,
+                       help='Maximum sequence length to process for '
+                            'evaluation. Defaults to `--seq-length`')
+    group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
+                       help='Maximum number of predictions to use for '
+                            'evaluation. Defaults to '
+                            'math.ceil(`--eval-seq-length`*.15/10)*10')
+    group.add_argument('--overlapping-eval', type=int, default=32,
+                       help='sliding window for overlapping eval')
+    group.add_argument('--cloze-eval', action='store_true',
+                       help='Evaluation dataset from `--valid-data` is a cloze task')
+    group.add_argument('--eval-hf', action='store_true',
+                       help='perform evaluation with huggingface openai model. '
+                            'Use `--load` to specify weights path to be loaded')
+    group.add_argument('--load-openai', action='store_true',
+                       help='load openai weights into our model. Use `--load` '
+                            'to specify weights path to be loaded')
+
+    return parser
+
+
+def add_text_generate_args(parser):
+    """Text generation arguments."""
+
+    group = parser.add_argument_group('Text generation', 'configurations')
+    group.add_argument('--temperature', type=float, default=1.0)
+    group.add_argument('--top_p', type=float, default=0.0)
+    group.add_argument('--top_k', type=int, default=0)
+    group.add_argument('--out-seq-length', type=int, default=256)
+    return parser
+
+
+def add_struct_args(parser):
+    group = parser.add_argument_group('struct', 'struct configurations')
+    group.add_argument('--gradient-accumulation-steps', type=int, default=1,
+                       help='Not implemented yet.')
+    group.add_argument('--num-epochs', type=int, default=1,
+                       help='Not implemented yet.')
+    group.add_argument('--struct-bert-dataset', action='store_true', default=False,
+                       help='Use struct bert dataset or not.')
+    return parser
+
+
+def add_palm_args(parser):
+    group = parser.add_argument_group('palm', 'palm configurations')
+    group.add_argument('--dec-layers', type=int, default=6,
+                       help='num decoder layers')
+    group.add_argument('--tgt-length', type=int, default=100,
+                       help='maximum target sequence length')
+    group.add_argument('--vae-size', type=int, default=8192,
+                       help='vae code vocab size')
+    group.add_argument('--max-image-position', type=int, default=1025,
+                       help='max image decode position')
+    group.add_argument('--palm-dataset', action='store_true', default=False,
+                       help='Use palm dataset or not.')
+    group.add_argument('--image-dataset', action='store_true', default=False,
+                       help='Use image dataset or not.')
+    group.add_argument('--do-mask-lm', action='store_true', default=False,
+                       help='Do mask lm task or not.')
+    group.add_argument('--vae-enc-model', type=str, default=None,
+                       help='Path to a directory containing the VAE encoder '
+                            'checkpoint.')
+    return parser
+
+
+def add_downstream_args(parser):
+    group = parser.add_argument_group('downstream', 'downstream configurations')
+    group.add_argument('--downstream-dataset', action='store_true', default=False,
+                       help='Use downstream dataset or not.')
+    group.add_argument('--task-name', default='ocnli', type=str)
+    return parser
+
+
+def add_data_args(parser):
+    """Train/valid/test data arguments."""
+
+    group = parser.add_argument_group('data', 'data configurations')
+
+    group.add_argument('--model-parallel-size', type=int, default=1,
+                       help='size of the model parallel group.')
+    group.add_argument('--shuffle', action='store_true',
+                       help='Shuffle data. Shuffling is deterministic '
+                            'based on seed and current epoch.')
+    group.add_argument('--train-data', nargs='+', default=None,
+                       help='Whitespace separated filenames or corpora names '
+                            'for training.')
+    group.add_argument('--use-npy-data-loader', action='store_true',
+                       help='Use the numpy data loader. If set, then '
+                            'train-data-path, val-data-path, and test-data-path '
+                            'should also be provided.')
+    group.add_argument('--train-data-path', type=str, default='',
+                       help='path to the training data')
+    group.add_argument('--val-data-path', type=str, default='',
+                       help='path to the validation data')
+    group.add_argument('--test-data-path', type=str, default='',
+                       help='path to the test data')
+    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
+                       help='the filename containing all the shards sizes')
+    group.add_argument('--delim', default=',',
+                       help='delimiter used to parse csv data files')
+    group.add_argument('--text-key', default='sentence',
+                       help='key to use to extract text from json/csv')
+    group.add_argument('--eval-text-key', default=None,
+                       help='key to use to extract text from '
+                            'json/csv evaluation datasets')
+    group.add_argument('--valid-data', nargs='*', default=None,
+                       help='Filename for validation data.')
+    group.add_argument('--split', default='1000,1,1',
+                       help='comma-separated list of proportions for training, '
+                            'validation, and test split')
+    group.add_argument('--test-data', nargs='*', default=None,
+                       help='Filename for testing')
+    group.add_argument('--lazy-loader', action='store_true',
+                       help='whether to lazy read the data set')
+    group.add_argument('--loose-json', action='store_true',
+                       help='Use loose json (one json-formatted string per '
+                            'newline), instead of tight json (data file is one '
+                            'json string)')
+    group.add_argument('--presplit-sentences', action='store_true',
+                       help='Dataset content consists of documents where '
+                            'each document consists of newline separated sentences')
+    group.add_argument('--num-workers', type=int, default=2,
+                       help='Number of workers to use for dataloading')
+    group.add_argument('--tokenizer-model-type', type=str,
+                       default='bert-large-uncased',
+                       help="Model type to use for sentencepiece tokenization "
+                            "(one of ['bpe', 'char', 'unigram', 'word']) or "
+                            "bert vocab to use for BertWordPieceTokenizer (one of "
+                            "['bert-large-uncased', 'bert-large-cased', etc.])")
+    group.add_argument('--tokenizer-path', type=str, default='tokenizer.model',
+                       help='path used to save/load sentencepiece tokenization '
+                            'models')
+    group.add_argument('--tokenizer-type', type=str,
+                       default='BertWordPieceTokenizer',
+                       choices=['CharacterLevelTokenizer',
+                                'SentencePieceTokenizer',
+                                'BertWordPieceTokenizer',
+                                'GPT2BPETokenizer'],
+                       help='what type of tokenizer to use')
+    group.add_argument('--cache-dir', default=None, type=str,
+                       help='Where to store pre-trained BERT downloads')
+    group.add_argument('--use-tfrecords', action='store_true',
+                       help='load `--train-data`, `--valid-data`, '
+                            '`--test-data` from BERT tf records instead of '
+                            'normal data pipeline')
+    group.add_argument('--seq-length', type=int, default=512,
+                       help='Maximum sequence length to process')
+    group.add_argument('--max-preds-per-seq', type=int, default=None,
+                       help='Maximum number of predictions to use per sequence. '
+                            'Defaults to math.ceil(`--seq-length`*.15/10)*10. '
+                            'MUST BE SPECIFIED IF `--use-tfrecords` is True.')
+
+    return parser
+
+
+def get_args():
+    """Parse all the args."""
+
+    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
+    parser = add_model_config_args(parser)
+    parser = add_fp16_config_args(parser)
+    parser = add_training_args(parser)
+    parser = add_evaluation_args(parser)
+    parser = add_text_generate_args(parser)
+    parser = add_struct_args(parser)
+    parser = add_palm_args(parser)
+    parser = add_downstream_args(parser)
+    parser = add_data_args(parser)
+
+    # Include DeepSpeed configuration arguments.
+    parser = deepspeed.add_config_arguments(parser)
+
+    args = parser.parse_args()
+
+    args.deepspeed = False
+    args.cuda = torch.cuda.is_available()
+
+    args.rank = int(os.getenv('RANK', '0'))
+    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
+
+    if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
+        # We are using (OpenMPI) mpirun for launching distributed data
+        # parallel processes.
+        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
+        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
+
+        # Possibly running with Slurm.
+        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
+        nodeid = int(os.getenv('SLURM_NODEID', '0'))
+
+        args.local_rank = local_rank
+        args.rank = nodeid * local_size + local_rank
+        args.world_size = num_nodes * local_size
+
+    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
+    if args.rank == 0:
+        print('using world size: {} and model-parallel size: {} '.format(
+            args.world_size, args.model_parallel_size))
+
+    args.dynamic_loss_scale = False
+    if args.loss_scale is None:
+        args.dynamic_loss_scale = True
+        if args.rank == 0:
+            print(' > using dynamic loss scaling')
+
+    # The fp32_* args are only meaningful when fp16 is set, so they should
+    # all default to False otherwise.
+    if not args.fp16:
+        args.fp32_embedding = False
+        args.fp32_tokentypes = False
+        args.fp32_layernorm = False
+
+    return args
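For context, `get_args()` reads `sys.argv` plus the `RANK`/`WORLD_SIZE` environment variables, so a quick way to inspect the resulting namespace is a sketch like the following (illustrative flag values only; assumes deepspeed is installed, since the module imports it):

    import sys
    from modelscope.models.nlp.plug.arguments import get_args

    sys.argv = ['train.py', '--num-layers', '24', '--hidden-size', '1024',
                '--num-attention-heads', '16', '--fp16']
    args = get_args()
    print(args.world_size)          # 1 unless WORLD_SIZE is set
    print(args.dynamic_loss_scale)  # True, since --loss-scale was not given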
diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration_plug.py
new file mode 100644
index 00000000..c05ff127
--- /dev/null
+++ b/modelscope/models/nlp/plug/configuration_plug.py
@@ -0,0 +1,368 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PLUG model configuration"""
+
+import copy
+import json
+from collections import OrderedDict
+from typing import Mapping
+
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class PlugNLUConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of the PLUG NLU
+    (encoder) model. It is used to instantiate the model according to the
+    specified arguments, defining the model architecture. The class follows the
+    design of the BERT configuration, see `bert-base-uncased
+    <https://huggingface.co/bert-base-uncased>`__.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can
+    be used to control the model outputs. Read the documentation from
+    :class:`~transformers.PretrainedConfig` for more information.
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 21504):
+            Vocabulary size of the model. Defines the number of different tokens that
+            can be represented by the :obj:`input_ids` passed to the model.
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer
+            encoder.
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in
+            the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and
+            pooler. If string, :obj:`"gelu"`, :obj:`"relu"` and :obj:`"swish"` are
+            supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings,
+            encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or
+            2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed to the model.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.00707):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layernorm_epsilon (:obj:`float`, `optional`, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+
+    Examples::
+
+        >>> # Initializing a PLUG NLU style configuration
+        >>> configuration = PlugNLUConfig()
+
+        >>> # Inspecting the configuration
+        >>> configuration.to_json_string()
+    """
+    model_type = 'plugNLU'
+
+    def __init__(self,
+                 vocab_size=21504,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.00707,
+                 deep_init=False,
+                 deepspeed=False,
+                 lr_decay_style='linear',
+                 weight_decay=1e-2,
+                 clip_grad=1.0,
+                 warmup=0.01,
+                 pre_ln=False,
+                 fp16=False,
+                 fp32_layernorm=False,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-12,
+                 dec_hidden_layers=6,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0,
+                 pruning_initial_threshold=1.0,
+                 pruning_final_threshold=0.01,
+                 pruning_initial_warmup=1,
+                 pruning_final_warmup=20,
+                 pruning_module='decoder',
+                 pruning_decay_step=50,
+                 pruning_decay_type='exp',
+                 ft_module=None,
+                 attn_separate=True,
+                 LR_weight_rank=8,
+                 LR_mask_rank=8,
+                 **kwargs):
+        super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.deep_init = deep_init
+        self.deepspeed = deepspeed
+        self.lr_decay_style = lr_decay_style
+        self.weight_decay = weight_decay
+        self.clip_grad = clip_grad
+        self.warmup = warmup
+        self.pre_ln = pre_ln
+        self.fp16 = fp16
+        self.fp32_layernorm = fp32_layernorm
+        self.fp32_embedding = fp32_embedding
+        self.layernorm_epsilon = layernorm_epsilon
+        self.fp32_tokentypes = fp32_tokentypes
+        self.dec_hidden_layers = dec_hidden_layers
+        self.pruning_method = pruning_method
+        self.pruning_mask_init = pruning_mask_init
+        self.pruning_mask_scale = pruning_mask_scale
+        self.pruning_module = pruning_module
+        self.pruning_initial_threshold = pruning_initial_threshold
+        self.pruning_final_threshold = pruning_final_threshold
+        self.pruning_initial_warmup = pruning_initial_warmup
+        self.pruning_final_warmup = pruning_final_warmup
+        self.pruning_decay_step = pruning_decay_step
+        self.pruning_decay_type = pruning_decay_type
+        self.ft_module = ft_module
+        self.attn_separate = attn_separate
+        self.LR_weight_rank = LR_weight_rank
+        self.LR_mask_rank = LR_mask_rank
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a config from a Python dictionary of parameters."""
+        config = cls()
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a config from a json file of parameters."""
+        with open(json_file, 'r', encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def merge_args(self, args):
+        """Merge attributes from an argparse namespace that are not already
+        set on this config."""
+        local_keys = self.__dict__.keys()
+        for key, value in args.__dict__.items():
+            if key in local_keys:
+                continue
+            self.__dict__[key] = value
+        return self
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'
+
+
+class PlugNLGConfig(PlugNLUConfig):
+    r"""
+    This is the configuration class to store the configuration of the PLUG NLG
+    (encoder-decoder) model. It accepts the same arguments as
+    :class:`PlugNLUConfig` (see above); the only difference is that
+    :obj:`attn_separate` defaults to :obj:`False`.
+    """
+    model_type = 'plugNLG'
+
+    def __init__(self,
+                 vocab_size=21504,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.00707,
+                 deep_init=False,
+                 deepspeed=False,
+                 lr_decay_style='linear',
+                 weight_decay=1e-2,
+                 clip_grad=1.0,
+                 warmup=0.01,
+                 pre_ln=False,
+                 fp16=False,
+                 fp32_layernorm=False,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-12,
+                 dec_hidden_layers=6,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0,
+                 pruning_initial_threshold=1.0,
+                 pruning_final_threshold=0.01,
+                 pruning_initial_warmup=1,
+                 pruning_final_warmup=20,
+                 pruning_module='decoder',
+                 pruning_decay_step=50,
+                 pruning_decay_type='exp',
+                 ft_module=None,
+                 attn_separate=False,
+                 LR_weight_rank=8,
+                 LR_mask_rank=8,
+                 **kwargs):
+        super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.deep_init = deep_init
+        self.deepspeed = deepspeed
+        self.lr_decay_style = lr_decay_style
+        self.weight_decay = weight_decay
+        self.clip_grad = clip_grad
+        self.warmup = warmup
+        self.pre_ln = pre_ln
+        self.fp16 = fp16
+        self.fp32_layernorm = fp32_layernorm
+        self.fp32_embedding = fp32_embedding
+        self.layernorm_epsilon = layernorm_epsilon
+        self.fp32_tokentypes = fp32_tokentypes
+        self.dec_hidden_layers = dec_hidden_layers
+        self.pruning_method = pruning_method
+        self.pruning_mask_init = pruning_mask_init
+        self.pruning_mask_scale = pruning_mask_scale
+        self.pruning_module = pruning_module
+        self.pruning_initial_threshold = pruning_initial_threshold
+        self.pruning_final_threshold = pruning_final_threshold
+        self.pruning_initial_warmup = pruning_initial_warmup
+        self.pruning_final_warmup = pruning_final_warmup
+        self.pruning_decay_step = pruning_decay_step
+        self.pruning_decay_type = pruning_decay_type
+        self.ft_module = ft_module
+        self.attn_separate = attn_separate
+        self.LR_weight_rank = LR_weight_rank
+        self.LR_mask_rank = LR_mask_rank
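A minimal sketch of round-tripping the config through `to_dict`/`from_dict` (the values are illustrative, not the released PLUG hyper-parameters):

    from modelscope.models.nlp.plug import PlugNLGConfig

    cfg = PlugNLGConfig(hidden_size=768, num_hidden_layers=12,
                        dec_hidden_layers=6, fp16=True)
    restored = PlugNLGConfig.from_dict(cfg.to_dict())
    assert restored.dec_hidden_layers == 6
    print(restored.to_json_string())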
diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py
new file mode 100644
index 00000000..ae6baf68
--- /dev/null
+++ b/modelscope/models/nlp/plug/distributed_plug.py
@@ -0,0 +1,198 @@
+import os
+import random
+from typing import Dict
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from modelscope.models.base import Tensor
+from modelscope.utils.nlp import mpu
+from modelscope.utils.nlp.distributed import DistributedDataParallel as DDP
+from modelscope.utils.nlp.fp16 import FP16_Module
+from modelscope.utils.nlp.utils import print_rank_0
+from modelscope.utils.torch_utils import init_dist
+
+from . import PlugModel
+
+
+def initialize_distributed(rank):
+    """Initialize torch.distributed and the model-parallel communicators."""
+    # Manually set the device ids.
+    # torch.multiprocessing.set_start_method("spawn")
+    device = rank % torch.cuda.device_count()
+    torch.cuda.set_device(device)
+    # Call the init process.
+    master_ip = os.getenv('MASTER_ADDR', '127.0.0.1')
+    master_port = os.getenv('MASTER_PORT', '12345')
+    init_method = 'tcp://' + master_ip + ':' + master_port
+    torch.distributed.init_process_group(
+        backend='nccl', world_size=8, rank=rank, init_method=init_method)
+    # Set the model-parallel communicators.
+    mpu.initialize_model_parallel(8)
+
+
+def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+    # This function has been mostly taken from the huggingface conversational
+    # ai code at
+    # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313
+
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of
+        # the top-k.
+        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        # Convert to 1D.
+        logits = logits.view(logits.size()[1]).contiguous()
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold.
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above
+        # the threshold.
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+        indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        logits[indices_to_remove] = filter_value
+        # Going back to 2D.
+        logits = logits.view(1, -1).contiguous()
+
+    return logits
+
+
+class DistributedPlug:
+    # NOTE: the helper methods below are invoked unbound, with `cls` passed
+    # explicitly from the `init` classmethod.
+
+    @classmethod
+    def init(cls, rank, model_dir, model_config, args):
+        cls.rank = rank
+        cls.args = args
+        cls.config = model_config
+        cls.model_dir = model_dir
+        initialize_distributed(rank)
+        cls.set_random_seed(cls, args.seed)
+        cls.setup_model(cls, path_load_tag='model')
+
+    def set_random_seed(cls, seed):
+        if seed is not None and seed > 0:
+            random.seed(seed)
+            np.random.seed(seed)
+            torch.manual_seed(seed)
+            mpu.model_parallel_cuda_manual_seed(seed)
+
+    def get_model(cls):
+        """Build the model."""
+        print_rank_0('Building Plug model. It will take a few minutes ...')
+        model = PlugModel(cls.config)
+
+        if mpu.get_data_parallel_rank() == 0:
+            print(' > number of parameters on model parallel rank {}: {}'.format(
+                mpu.get_model_parallel_rank(),
+                sum([p.nelement() for p in model.parameters()])), flush=True)
+
+        if cls.args.deepspeed and cls.args.fp16:
+            model.half()
+
+        # GPU allocation.
+        model.cuda(torch.cuda.current_device())
+
+        # Fp16 conversion.
+        if cls.args.fp16:
+            model = FP16_Module(model)
+            if cls.args.fp32_embedding:
+                model.module.model.bert.embeddings.word_embeddings.float()
+                model.module.model.bert.embeddings.position_embeddings.float()
+                model.module.model.bert.embeddings.token_type_embeddings.float()
+            if cls.args.fp32_tokentypes:
+                model.module.model.bert.embeddings.token_type_embeddings.float()
+            if cls.args.fp32_layernorm:
+                for name, _module in model.named_modules():
+                    if 'LayerNorm' in name:
+                        _module.float()
+
+        # model = DDP(model)
+        return model
+
+    def setup_model(cls, path_load_tag='model'):
+        dist_model = cls.get_model(cls)
+        if cls.model_dir is not None:
+            from modelscope.utils.nlp.load_checkpoint import pre_load
+            load_model = pre_load(mpu, cls.model_dir, tag=path_load_tag)
+            # With DDP enabled this would be dist_model.module.module.model.
+            model_dict = dist_model.module.model.state_dict()
+            for key in load_model:
+                if key not in model_dict.keys():
+                    print_rank_0('Skip key: ' + key)
+                else:
+                    print_rank_0('Loading key: ' + key)
+            dist_model.module.model.load_state_dict(load_model, strict=False)
+        cls.args.iteration = 0
+        cls.dist_model = dist_model
+
+    @classmethod
+    def forward(cls, input: Dict[str, Tensor]):
+        device = torch.cuda.current_device()
+        tokens = input['input_ids'].to(device)
+        dec_input_ids = input['dec_input_ids'].to(device)
+        attention_mask = input['attention_mask'].to(device)
+        cls.dist_model.eval()
+        seq_length = 128
+        with torch.no_grad():
+            all_generate_tokens = []
+            generate_tokens = []
+            counter = 0
+            sequence_output = None
+            vocab_size = 21128
+            while counter < seq_length:
+                position_ids = torch.full([cls.args.batch_size, 1],
+                                          len(generate_tokens),
+                                          dtype=torch.long,
+                                          device=device)
+                _, logits, sequence_output = cls.dist_model(
+                    tokens, None, attention_mask, dec_input_ids,
+                    attention_mask, position_ids, is_infer=True,
+                    sequence_output=sequence_output, parallel_output=False)
+
+                logits = logits[:, -1, :]
+                logits = logits / cls.args.temperature
+                logits = top_k_logits(logits, top_k=cls.args.top_k,
+                                      top_p=cls.args.top_p)
+                log_probs = F.softmax(logits, dim=-1)
+                prev = torch.multinomial(log_probs, num_samples=1)
+                prev_token = prev[0].item()
+                # Map out-of-vocabulary samples to [UNK] (id 100).
+                if prev_token >= vocab_size:
+                    prev_token = 100
+                    prev[0] = 100
+                # Stop at [SEP] (id 102).
+                if prev_token == 102:
+                    break
+                dec_input_ids = torch.cat([dec_input_ids, prev], dim=1)
+                generate_tokens.append(prev_token)
+                all_generate_tokens.append(prev_token)
+                counter += 1
+
+        # Collapse runs of consecutive [UNK] tokens into one.
+        generate_context = []
+        for token in all_generate_tokens:
+            if generate_context and generate_context[-1] == 100 and token == 100:
+                continue
+            generate_context.append(token)
+        return {'generate_context': generate_context}
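To make the filtering concrete, here is a small sketch of `top_k_logits` in isolation (note that it modifies `logits` in place, and the top-p branch assumes a `[1, vocab]` shaped tensor):

    import torch
    from modelscope.models.nlp.plug.distributed_plug import top_k_logits

    logits = torch.tensor([[1.0, 3.0, 2.0, 0.5]])
    filtered = top_k_logits(logits.clone(), top_k=2)
    print(filtered)  # tensor([[-inf, 3., 2., -inf]])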
""" + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + print("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m"] for n in name): + print("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + +class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = mpu.VocabParallelEmbedding( + config.vocab_size, config.hidden_size, + init_method=normal_init_method(mean=0.0, + std=config.initializer_range)) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.fp32_layernorm = config.fp32_layernorm + self.fp32_embedding = config.fp32_embedding + self.fp32_tokentypes = config.fp32_tokentypes + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if not self.fp32_tokentypes: + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + if self.fp32_embedding and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_embedding: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + else: + embeddings = words_embeddings.float() + position_embeddings.float() + token_type_embeddings.float() + if self.fp32_tokentypes and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_tokentypes: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method(mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method(mean=0.0, + std=config.initializer_range) + self.dense = mpu.RowParallelLinear( + input_size=config.hidden_size, + output_size=config.hidden_size, + bias=True, + input_is_parallel=True, + stride=1, + init_method=init_method, + pruning_method=config.pruning_method if config.pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfvo', 'encoder_selfo'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.fp32_layernorm = config.fp32_layernorm + if not config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, 
input_tensor, pruning_threshold=None,): + hidden_states = self.dense(hidden_states, pruning_threshold=pruning_threshold,) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + if self.LayerNorm is not None: + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + else: + hidden_states = ln_input + return hidden_states + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.self = mpu.BertParallelSelfAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + dropout_prob=config.attention_probs_dropout_prob, + output_parallel=True, + init_method=normal_init_method(mean=0.0, + std=config.initializer_range), + separate=config.attn_separate, + pruning_method=config.pruning_method, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + pruning_module=config.pruning_module, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask, pruning_threshold=None,): + if self.LayerNorm is not None: + ln_input = input_tensor + previous_type = input_tensor.type() + if self.fp32_layernorm: + ln_input = input_tensor.float() + ln_output = self.LayerNorm(ln_input) + if self.fp32_layernorm: + ln_output = ln_output.type(previous_type) + self_output = self.self(ln_output, attention_mask, pruning_threshold=pruning_threshold,) + else: + self_output = self.self(input_tensor, attention_mask, pruning_threshold=pruning_threshold,) + # output_pruning_threshold = 1 - (1 - pruning_threshold)/0.99*0.95 + output_pruning_threshold = pruning_threshold + + attention_output = self.output(self_output, input_tensor, pruning_threshold=output_pruning_threshold,) + return attention_output + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = mpu.ColumnParallelLinear( + input_size=config.hidden_size, + output_size=config.intermediate_size, + bias=True, + gather_output=False, + stride=1, + init_method=normal_init_method(mean=0.0, std=config.initializer_range), + pruning_method=config.pruning_method if config.pruning_module in ['all', 'encoder', 'encoder_ffn'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states, pruning_threshold=None,): + hidden_states = self.dense(hidden_states, pruning_threshold=pruning_threshold,) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method(mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method(mean=0.0, + std=config.initializer_range) + self.dense = 
mpu.RowParallelLinear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=True, + input_is_parallel=True, + stride=1, + init_method=init_method, + pruning_method=config.pruning_method if config.pruning_module in ['all', 'encoder', 'encoder_ffn'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.fp32_layernorm = config.fp32_layernorm + if not config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor, pruning_threshold=None,): + hidden_states = self.dense(hidden_states, pruning_threshold=pruning_threshold,) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + if self.LayerNorm is not None: + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + else: + hidden_states = ln_input + return hidden_states + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + + def forward(self, hidden_states, attention_mask, pruning_threshold=None,): + attention_output = self.attention(hidden_states, attention_mask, pruning_threshold=pruning_threshold) + if self.LayerNorm is not None: + ln_input = attention_output + previous_type = attention_output.type() + if self.fp32_layernorm: + ln_input = attention_output.float() + ln_output = self.LayerNorm(ln_input) + if self.fp32_layernorm: + ln_output = ln_output.type(previous_type) + intermediate_output = self.intermediate(ln_output, pruning_threshold=pruning_threshold) + else: + intermediate_output = self.intermediate(attention_output, pruning_threshold=pruning_threshold) + layer_output = self.output(intermediate_output, attention_output, pruning_threshold=pruning_threshold) + return layer_output + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, pruning_threshold=None,): + all_encoder_layers = [] + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1], pruning_threshold=pruning_threshold) + return x_ + return custom_forward + + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = 1 #math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = mpu.checkpoint(custom(l, l+chunk_length), hidden_states, attention_mask*1) + if detach_index == l: + hidden_states.detach_() + l += chunk_length + # decoder layers + else: 
+ for i, layer_module in enumerate(self.layer):
+ hidden_states = layer_module(hidden_states, attention_mask, pruning_threshold=pruning_threshold)
+ if detach_index == i:
+ hidden_states.detach_()
+ if i == len(self.layer) - 1 and self.LayerNorm is not None:
+ previous_type = hidden_states.type()
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.float()
+ hidden_states = self.LayerNorm(hidden_states)
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.type(previous_type)
+ if output_all_encoded_layers:
+ all_encoder_layers.append(hidden_states)
+
+ if not output_all_encoded_layers or checkpoint_activations:
+ if self.LayerNorm is not None:
+ previous_type = hidden_states.type()
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.float()
+ hidden_states = self.LayerNorm(hidden_states)
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.type(previous_type)
+ all_encoder_layers.append(hidden_states)
+ return all_encoder_layers
+
+class BertPooler(nn.Module):
+ def __init__(self, config):
+ super(BertPooler, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+class BertPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super(BertPredictionHeadTransform, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.transform_act_fn = ACT2FN[config.hidden_act] \
+ if isinstance(config.hidden_act, str) else config.hidden_act
+ self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon)
+ self.fp32_layernorm = config.fp32_layernorm
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ previous_type = hidden_states.type()
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.float()
+ hidden_states = self.LayerNorm(hidden_states)
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.type(previous_type)
+ return hidden_states
+
+class BertLMPredictionHead(nn.Module):
+ def __init__(self, config, bert_model_embedding_weights):
+ super(BertLMPredictionHead, self).__init__()
+ self.transform = BertPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
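+ # Weight tying: decoder_weight below aliases the input embedding matrix,
+ # so the per-token bias is the only new parameter here; it is flagged
+ # model_parallel since the vocabulary dimension is partitioned across ranks.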
+ #self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
+ # bert_model_embedding_weights.size(0),
+ # bias=False)
+ self.decoder_weight = bert_model_embedding_weights
+ self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
+ self.bias.model_parallel = True
+ self.fp32_embedding = config.fp32_embedding
+ self.fp32_layernorm = config.fp32_layernorm
+ def convert_to_type(tensor):
+ if self.fp32_embedding:
+ return tensor.half()
+ else:
+ return tensor
+ self.type_converter = convert_to_type
+ self.converted = False
+ self.timers = SynchronizedWallClockTimer()
+
+ def forward(self, hidden_states):
+ if not self.converted:
+ self.converted = True
+ if self.fp32_embedding:
+ self.transform.half()
+ if self.fp32_layernorm:
+ self.transform.LayerNorm.float()
+ hidden_states = self.transform(self.type_converter(hidden_states))
+ # hidden_states = self.decoder(hidden_states) + self.bias
+ self.timers('final linear gather').start()
+ hidden_states = mpu.copy_to_model_parallel_region(hidden_states)
+ self.timers('final linear gather').stop()
+ hidden_states = F.linear(self.type_converter(hidden_states),
+ self.type_converter(self.decoder_weight),
+ self.type_converter(self.bias))
+ #self.timers.log(names=['final linear gather'])
+ return hidden_states
+
+
+class BertPreTrainingHeads(nn.Module):
+ def __init__(self, config, bert_model_embedding_weights):
+ super(BertPreTrainingHeads, self).__init__()
+ self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+ self.seq_relationship = nn.Linear(config.hidden_size, 3)
+
+ def forward(self, sequence_output, pooled_output):
+ prediction_scores = self.predictions(sequence_output)
+ for p in self.seq_relationship.parameters():
+ if p is None:
+ continue
+ pooled_output = pooled_output.type_as(p)
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return prediction_scores, seq_relationship_score
+
+class PreTrainedBertModel(nn.Module):
+ """ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+ def __init__(self, config, *inputs, **kwargs):
+ super(PreTrainedBertModel, self).__init__()
+ if not isinstance(config, PlugNLUConfig) and not isinstance(config, PlugNLGConfig):
+ raise ValueError(
+ "Parameter config in `{}(config)` should be an instance of class "
+ "`PlugNLUConfig` or `PlugNLGConfig`. To create a model from a "
+ "pretrained model file use "
+ "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+ self.__class__.__name__, self.__class__.__name__
+ ))
+ self.config = config
+
+ def init_bert_weights(self, module):
+ """ Initialize the weights.
+ """
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, BertLayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ #@classmethod
+ #def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
+ # fp32_layernorm=False, fp32_embedding=False, layernorm_epsilon=1e-12,
+ # fp32_tokentypes=False, *inputs, **kwargs):
+ # """
+ # Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict.
+ # Download and cache the pre-trained model file if needed.
+
+ # Params:
+ # pretrained_model_name: either:
+ # - a str with the name of a pre-trained model to load selected in the list of:
+ # . `bert-base-uncased`
+ # . `bert-large-uncased`
+ # . `bert-base-cased`
+ # . `bert-large-cased`
+ # . `bert-base-multilingual-uncased`
+ # . `bert-base-multilingual-cased`
+ # . `bert-base-chinese`
+ # - a path or url to a pretrained model archive containing:
+ # . `bert_config.json` a configuration file for the model
+ # . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
+ # cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+ # state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
+ # *inputs, **kwargs: additional input for the specific Bert class
+ # (ex: num_labels for BertForSequenceClassification)
+ # """
+ # if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
+ # archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+ # else:
+ # archive_file = pretrained_model_name
+ # # redirect to the cache, if necessary
+ # try:
+ # resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+ # except FileNotFoundError:
+ # logger.error(
+ # "Model name '{}' was not found in model name list ({}). "
+ # "We assumed '{}' was a path or url but couldn't find any file "
+ # "associated to this path or url.".format(
+ # pretrained_model_name,
+ # ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+ # archive_file))
+ # return None
+ # if resolved_archive_file == archive_file:
+ # logger.info("loading archive file {}".format(archive_file))
+ # else:
+ # logger.info("loading archive file {} from cache at {}".format(
+ # archive_file, resolved_archive_file))
+ # tempdir = None
+ # if os.path.isdir(resolved_archive_file):
+ # serialization_dir = resolved_archive_file
+ # else:
+ # # Extract archive to temp dir
+ # tempdir = tempfile.mkdtemp()
+ # logger.info("extracting archive file {} to temp dir {}".format(
+ # resolved_archive_file, tempdir))
+ # with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+ # archive.extractall(tempdir)
+ # serialization_dir = tempdir
+ # # Load config
+ # config_file = os.path.join(serialization_dir, CONFIG_NAME)
+ # config = PlugNLUConfig.from_json_file(config_file)
+ # config.fp32_layernorm = fp32_layernorm
+ # config.fp32_embedding = fp32_embedding
+ # config.layernorm_epsilon = layernorm_epsilon
+ # config.fp32_tokentypes = fp32_tokentypes
+ # logger.info("Model config {}".format(config))
+ # # Instantiate model.
+ # model = cls(config, *inputs, **kwargs)
+ # if state_dict is None:
+ # weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+ # state_dict = torch.load(weights_path)
+
+ # old_keys = []
+ # new_keys = []
+ # for key in state_dict.keys():
+ # new_key = None
+ # if 'gamma' in key:
+ # new_key = key.replace('gamma', 'weight')
+ # if 'beta' in key:
+ # new_key = key.replace('beta', 'bias')
+ # if new_key:
+ # old_keys.append(key)
+ # new_keys.append(new_key)
+ # for old_key, new_key in zip(old_keys, new_keys):
+ # state_dict[new_key] = state_dict.pop(old_key)
+
+ # missing_keys = []
+ # unexpected_keys = []
+ # error_msgs = []
+ # # copy state_dict so _load_from_state_dict can modify it
+ # metadata = getattr(state_dict, '_metadata', None)
+ # state_dict = state_dict.copy()
+ # if metadata is not None:
+ # state_dict._metadata = metadata
+
+ # def load(module, prefix=''):
+ # local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+ # module._load_from_state_dict(
+ # state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+ # for name, child in module._modules.items():
+ # if child is not None:
+ # load(child, prefix + name + '.')
+ # load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
+ # if len(missing_keys) > 0:
+ # logger.info("Weights of {} not initialized from pretrained model: {}".format(
+ # model.__class__.__name__, missing_keys))
+ # if len(unexpected_keys) > 0:
+ # logger.info("Weights from pretrained model not used in {}: {}".format(
+ # model.__class__.__name__, unexpected_keys))
+ # if tempdir:
+ # # Clean up temp dir
+ # shutil.rmtree(tempdir)
+ # return model
+
+class BertModel(PreTrainedBertModel):
+ """BERT model ("Bidirectional Encoder Representations from Transformers").
+
+ Params:
+ config: a BertConfig class instance with the configuration to build a new model
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+ with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+ `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
+
+ Outputs: Tuple of (encoded_layers, pooled_output)
+ `encoded_layers`: controlled by the `output_all_encoded_layers` argument:
+ - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+ of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+ encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+ - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+ to the last attention block of shape [batch_size, sequence_length, hidden_size],
+ `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+ classifier pretrained on top of the hidden state associated to the first token of the
+ input (`[CLS]`) to train on the Next-Sentence task (see BERT's paper).
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ model = modeling.BertModel(config=config)
+ all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+ def __init__(self, config):
+ super(BertModel, self).__init__(config)
+ self.embeddings = BertEmbeddings(config)
+ self.encoder = BertEncoder(config)
+ self.pooler = BertPooler(config)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, pruning_threshold=None,):
+ if attention_mask is None:
+ attention_mask = torch.ones_like(input_ids)
+ if token_type_ids is None:
+ token_type_ids = torch.zeros_like(input_ids)
+
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is simpler than the triangular masking of causal attention
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
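+ # For example, a padding mask [1, 1, 0] becomes the additive bias
+ # [0.0, 0.0, -10000.0] after the two lines below.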
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.encoder.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations, + detach_index=detach_index, + pruning_threshold=pruning_threshold) + sequence_output = encoded_layers[-1] + for p in self.pooler.parameters(): + if p is None: + continue + sequence_output = sequence_output.type_as(p) + break + #pooled_output = self.pooler(sequence_output) + pooled_output = sequence_output[:, 0] + if not output_all_encoded_layers or checkpoint_activations: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class DecodeLayer(nn.Module): + def __init__(self, config): + super(DecodeLayer, self).__init__() + init_method = normal_init_method(mean=0.0,std=config.initializer_range) + output_layer_init_method = scaled_init_method(mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + + self_pruning_method = config.pruning_method + cross_pruning_method = config.pruning_method + ffn_pruning_method = config.pruning_method + + if config.ft_module is not None: + if 'decoder_self' in config.ft_module: + self_pruning_method = 'finetune' + if 'decoder_cross' in config.ft_module: + cross_pruning_method = 'finetune' + if 'decoder_ffn' in config.ft_module: + ffn_pruning_method = 'finetune' + + self.attention = mpu.GPT2ParallelSelfAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + attention_dropout_prob=config.attention_probs_dropout_prob, + output_dropout_prob=config.hidden_dropout_prob, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + pruning_method=self_pruning_method if config.pruning_module in ['all', 'decoder', 'decoder_self', 'decoder_self+ffn'] else None, + pruning_mask_init=config.pruning_mask_init, pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank, + ) + + self.cross_attention = mpu.PalmParallelCrossAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + attention_dropout_prob=config.attention_probs_dropout_prob, + output_dropout_prob=config.hidden_dropout_prob, + init_method=init_method, attn_separate=False, + output_layer_init_method=output_layer_init_method, + pruning_method=cross_pruning_method, pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, pruning_module=config.pruning_module, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank,) + + self.input_layernorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.post_attention_layernorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.post_cross_attention_layernorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + + self.intermediate = mpu.ColumnParallelLinear(config.hidden_size, config.intermediate_size, gather_output=False, init_method=init_method, + pruning_method=ffn_pruning_method if config.pruning_module in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, + pruning_mask_init=config.pruning_mask_init, pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, 
LR_mask_rank=config.LR_mask_rank,) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.output = mpu.RowParallelLinear(config.intermediate_size, config.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, + pruning_method=ffn_pruning_method if config.pruning_module in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, + pruning_mask_init=config.pruning_mask_init, pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, LR_mask_rank=config.LR_mask_rank,) + + self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) + self.fp32_layernorm = config.fp32_layernorm + def convert_to_type(tensor): + if self.fp32_layernorm: + return tensor.float() + else: + return tensor + self.type_converter = convert_to_type + + + #def forward(self, hidden_states, enc_attn_mask, dec_attn_mask): + def forward(self, hidden_states, enc_hidden_states, enc_attn_mask, dec_attn_mask, is_infer=False, pruning_threshold=None): + residual = hidden_states + previous_type = hidden_states.type() + hidden_states = self.input_layernorm(self.type_converter(hidden_states)) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + hidden_states = self.attention(hidden_states, dec_attn_mask, is_infer=is_infer, pruning_threshold=pruning_threshold) + # add dropout? + # hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(self.type_converter(hidden_states)) + if self.fp32_layernorm: + # same to the output of BertAttention + hidden_states = hidden_states.type(previous_type) + hidden_states = self.cross_attention(hidden_states, enc_hidden_states, enc_attn_mask, pruning_threshold=pruning_threshold) + # hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.post_cross_attention_layernorm(self.type_converter(hidden_states)) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + hidden_states = self.intermediate(hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.intermediate_act_fn(hidden_states) + # hidden_states = self.dropout(hidden_states) + + hidden_states = self.output(hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + +class BertDecoder(nn.Module): + def __init__(self, config): + super(BertDecoder, self).__init__() + self.layer = nn.ModuleList([DecodeLayer(config) for _ in range(config.dec_hidden_layers)]) + + self.final_layernorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.fp32_layernorm = config.fp32_layernorm + + def forward(self, hidden_states, enc_hidden_states, enc_attn_mask, dec_attn_mask, checkpoint_activations=False, output_all_encoded_layers=False, is_infer=False, pruning_threshold=None): + all_encoder_layers = [] + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1], inputs[2], dec_attn_mask*1, is_infer=is_infer, pruning_threshold=pruning_threshold) + return x_ + return custom_forward + + pre_enc_hidden= enc_hidden_states.data + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = 1 #math.ceil(math.sqrt(num_layers)) + while l < num_layers: + 
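# mpu.checkpoint recomputes this chunk of layers during backward; the
+ # encoder states' .data is restored right after each call (pre_enc_hidden
+ # saved above), apparently because the checkpoint call may overwrite it.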
+ hidden_states = mpu.checkpoint(custom(l, l+chunk_length), hidden_states, enc_hidden_states, enc_attn_mask*1)
+ enc_hidden_states.data = pre_enc_hidden
+ l += chunk_length
+ else:
+ for i, layer_module in enumerate(self.layer):
+ hidden_states = layer_module(hidden_states, enc_hidden_states, enc_attn_mask, dec_attn_mask, is_infer=is_infer, pruning_threshold=pruning_threshold)
+
+ previous_type = hidden_states.type()
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.float()
+ hidden_states = self.final_layernorm(hidden_states)
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.type(previous_type)
+
+ return [hidden_states]
+
+class DecodeModel(PreTrainedBertModel):
+
+ def __init__(self, config):
+ super(DecodeModel, self).__init__(config)
+ self.decoder = BertDecoder(config)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, embeddings, sequence_output, decode_input_ids, position_ids=None, enc_attn_mask=None, dec_attn_mask=None, checkpoint_activations=False, is_infer=False, pruning_threshold=None):
+
+ extended_attention_mask = enc_attn_mask.unsqueeze(1).unsqueeze(2)
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.decoder.parameters()).dtype) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+ embedding_output = embeddings(decode_input_ids)
+ sequence_output = self.decoder(embedding_output,
+ sequence_output,
+ extended_attention_mask,
+ dec_attn_mask,
+ checkpoint_activations=checkpoint_activations,
+ is_infer=is_infer,
+ pruning_threshold=pruning_threshold)
+ return sequence_output[-1]
+
+class PalmForPreTraining(PreTrainedBertModel):
+ def __init__(self, config):
+ super(PalmForPreTraining, self).__init__(config)
+ self.bert = BertModel(config)
+ self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+ self.decoder = DecodeModel(config)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, decode_input_ids=None, position_ids=None, decode_attention_mask=None, lm_labels=None, checkpoint_activations=False, is_infer=False, sequence_output=None, parallel_output=True, pruning_threshold=None):
+ if sequence_output is None:
+ sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
+ output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations, pruning_threshold=pruning_threshold)
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+ else:
+ prediction_scores = None
+ seq_relationship_score = None
+ sequence_output = sequence_output.to(dtype=next(self.decoder.parameters()).dtype)
+ if attention_mask is None:
+ attention_mask = torch.ones_like(input_ids)
+ decode_output = self.decoder(self.bert.embeddings, sequence_output, decode_input_ids, position_ids, attention_mask, decode_attention_mask, checkpoint_activations=checkpoint_activations, is_infer=is_infer, pruning_threshold=pruning_threshold)
+
+ #prediction_scores = self.cls(decode_output)
+
+ transformer_output_parallel = mpu.copy_to_model_parallel_region(
+ decode_output)
+
+ logits_parallel = F.linear(transformer_output_parallel,
+ self.bert.embeddings.word_embeddings.weight)
+
+ if parallel_output:
+ return prediction_scores, logits_parallel
+ if is_infer:
+ return prediction_scores, mpu.gather_from_model_parallel_region(logits_parallel), sequence_output
+ return prediction_scores, mpu.gather_from_model_parallel_region(logits_parallel)
+
+class PlugModel(torch.nn.Module):
+
+ def __init__(self, config):
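+ # Builds the PALM encoder-decoder (PalmForPreTraining) defined above;
+ # state_dict()/load_state_dict() below delegate to it so checkpoints round-trip.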
+ super(PlugModel, self).__init__()
+ if config.intermediate_size is None:
+ intermediate_size = 4 * config.hidden_size
+ else:
+ intermediate_size = config.intermediate_size
+ self.config = config
+ # self.config = BertConfig(
+ # args.tokenizer_num_tokens,
+ # hidden_size=args.hidden_size,
+ # num_hidden_layers=args.num_layers,
+ # num_attention_heads=args.num_attention_heads,
+ # intermediate_size=intermediate_size,
+ # hidden_dropout_prob=args.hidden_dropout,
+ # attention_probs_dropout_prob=args.attention_dropout,
+ # max_position_embeddings=args.max_position_embeddings,
+ # type_vocab_size=args.tokenizer_num_type_tokens,
+ # fp32_layernorm=args.fp32_layernorm,
+ # fp32_embedding=args.fp32_embedding,
+ # fp32_tokentypes=args.fp32_tokentypes,
+ # layernorm_epsilon=args.layernorm_epsilon,
+ # deep_init=args.deep_init,
+ # dec_hidden_layers=args.dec_layers)
+ self.model = PalmForPreTraining(self.config)
+
+ def forward(self, input_tokens, token_type_ids=None,
+ attention_mask=None, target_tokens=None, position_ids=None, decode_attention_mask=None, checkpoint_activations=False, is_infer=False, sequence_output=None, parallel_output=True):
+ return self.model(
+ input_tokens, token_type_ids, attention_mask, target_tokens, position_ids,
+ decode_attention_mask, checkpoint_activations=checkpoint_activations, is_infer=is_infer, sequence_output=sequence_output, parallel_output=parallel_output)
+
+ def state_dict(self, destination=None, prefix='', keep_vars=False):
+ return self.model.state_dict(destination=destination, prefix=prefix,
+ keep_vars=keep_vars)
+
+ def load_state_dict(self, state_dict, strict=True):
+ return self.model.load_state_dict(state_dict, strict=strict)
+
+
diff --git a/modelscope/models/nlp/plug/plug_for_text_generation.py b/modelscope/models/nlp/plug/plug_for_text_generation.py
new file mode 100644
index 00000000..2875ebeb
--- /dev/null
+++ b/modelscope/models/nlp/plug/plug_for_text_generation.py
@@ -0,0 +1,57 @@
+import torch
+from typing import Dict
+from functools import partial
+
+from . import DistributedPlug
+from ...base import Tensor, TorchModel
+from ...builder import MODELS
+from ....metainfo import Models
+from ....outputs import OutputKeys
+from ....utils.constant import Tasks
+
+__all__ = ['PlugForTextGeneration']
+
+@MODELS.register_module(Tasks.text_generation, module_name=Models.plug)
+class PlugForTextGeneration(TorchModel):
+ def __init__(self, model_dir: str, *args, **kwargs):
+ super().__init__(model_dir, *args, **kwargs)
+
+ from transformers import BertTokenizer
+ from multiprocessing import Pool
+ from .arguments import get_args
+ from . import PlugNLGConfig
+
+ self.tokenizer = BertTokenizer.from_pretrained(model_dir)
+ model_config = PlugNLGConfig.from_pretrained(model_dir)
+
+ # TODO(suluyan): Arguments
+ args = get_args()
+ args.world_size = 8
+ args.model_parallel_size = 8
+ args.pre_load = True
+ args.distributed_backend = 'nccl'
+ args.fp16 = True
+ args.fp32_layernorm = True
+ args.checkpoint_activations = True
+ args.batch_size = 1
+ args.top_k = 20
+ args.top_p = 0.0
+ args.temperature = 0.9
+ self.args = args
+
+ self.world_size = args.world_size
+ ranks = list(range(self.world_size))
+ self.model_pool = Pool(self.world_size)
+ self.model_pool.map(partial(DistributedPlug.init, model_dir=model_dir, model_config=model_config, args=args), ranks)
+
+ def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+ # The model weights live in the worker processes rather than in this
+ # wrapper, so dispatch through the process pool as generate() does.
+ res = self.model_pool.map(DistributedPlug.forward, [input] * self.world_size)
+ return res[0]
+
+ def generate(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+ dec_input_ids = torch.full([self.args.batch_size, 1], self.tokenizer.cls_token_id, dtype=torch.long)
+ input['dec_input_ids'] = dec_input_ids
+ res = self.model_pool.map(DistributedPlug.forward, [input] * self.world_size)
+ return res[0]
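+
+# A minimal usage sketch (the model id below is a placeholder, and PLUG runs
+# 8-way model parallel, so this assumes a host with 8 GPUs):
+#
+#   from modelscope.pipelines import pipeline
+#   from modelscope.utils.constant import Tasks
+#
+#   pipe = pipeline(Tasks.text_generation, model='<plug-model-id>')
+#   print(pipe('今天天气很好，'))  # -> {'text': '...'}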
+
+
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index b1d82557..58f2ba1c 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -153,7 +153,7 @@ class Pipeline(ABC):
 if self.device_name == 'gpu':
 device = create_device()
 if device.type == 'gpu':
- torch.cuda.set_device(device)
+ pass # torch.cuda.set_device(device)
 yield
 else:
 yield
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 3d27ffa9..58f07d4c 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -8,6 +8,7 @@ from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.outputs import OutputKeys

 __all__ = ['TextGenerationPipeline']

@@ -56,6 +57,7 @@ class TextGenerationPipeline(Pipeline):
 sequence_length=kwargs.pop('sequence_length', 128))
 model.eval()
 super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+ self.tokenizer = preprocessor.tokenizer

 def forward(self, inputs: Dict[str, Any],
 **forward_params) -> Dict[str, Any]:
@@ -72,4 +74,6 @@ class TextGenerationPipeline(Pipeline):
 Returns:
 Dict[str, str]: the prediction results
 """
- return inputs
+ generate_context = inputs['generate_context']
+ generate_context = ''.join(self.tokenizer.convert_ids_to_tokens(generate_context)).replace('[UNK]', '“').replace('##', '')
+ return {OutputKeys.TEXT: generate_context}
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index f231df9a..017bcef8 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -161,7 +161,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
 """
 model_type = get_model_type(model_dir)

- if model_type in (Models.structbert, Models.gpt3, Models.palm):
+ if model_type in (Models.structbert, Models.gpt3, Models.palm, Models.plug):
 from modelscope.models.nlp.structbert import SbertTokenizer
 return SbertTokenizer.from_pretrained(model_dir, use_fast=False)
 elif model_type == Models.veco:
diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py
new file mode 100755
index 00000000..8403544d
--- /dev/null
+++ b/modelscope/utils/nlp/distributed.py
@@
-0,0 +1,109 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +import torch.distributed as dist +from torch.nn.modules import Module +from torch.autograd import Variable +from sofa.utils import mpu + +class DistributedDataParallel(Module): + + def __init__(self, module): + super(DistributedDataParallel, self).__init__() + self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False + + self.module = module + self.data_parallel_group = mpu.get_data_parallel_group() + src_rank = mpu.get_model_parallel_rank() + for p in self.module.parameters(): + if torch.is_tensor(p): + dist.broadcast(p, src_rank, group=self.data_parallel_group) + + def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False): + if(self.needs_reduction): + self.needs_reduction = False + buckets = {} + for name, param in self.module.named_parameters(): + if param.requires_grad and param.grad is not None: + tp = (param.data.type()) + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + if self.warn_on_half: + if torch.cuda.HalfTensor in buckets: + print("WARNING: gloo dist backend for half parameters may be extremely slow." 
+ + " It is recommended to use the NCCL backend in this case.") + self.warn_on_half = False + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + if fp32_allreduce: + coalesced = coalesced.float() + if not no_scale and not reduce_after: + coalesced /= dist.get_world_size(group=self.data_parallel_group) + dist.all_reduce(coalesced, group=self.data_parallel_group) + torch.cuda.synchronize() + if not no_scale and reduce_after: + coalesced /= dist.get_world_size(group=self.data_parallel_group) + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + self.hook_handles = [] + self.hooks = [] + for param in list(self.module.parameters()): + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(allreduce_params) + # handle = param.register_hook(allreduce_hook) + #self.hooks.append(allreduce_hook) + #self.hook_handles.append(handle) + self.allreduce_params = allreduce_params + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + #[h.remove() for h in self.hook_handles] + sd = self.module.state_dict(destination, prefix, keep_vars) + # for handle, hook in zip(self.hook_handles, self.hooks): + # d = handle.hooks_dict_ref() + # d[handle.id] = hook + + return sd + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + + ''' + def _sync_buffers(self): + buffers = list(self.module._all_buffers()) + if len(buffers) > 0: + # cross-node buffer sync + flat_buffers = _flatten_dense_tensors(buffers) + dist.broadcast(flat_buffers, 0) + for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): + buf.copy_(synced) + def train(self, mode=True): + # Clear NCCL communicator and CUDA event cache of the default group ID, + # These cache will be recreated at the later call. This is currently a + # work-around for a potential NCCL deadlock. + if dist._backend == dist.dist_backend.NCCL: + dist._clear_group_cache() + super(DistributedDataParallel, self).train(mode) + self.module.train(mode) + ''' + diff --git a/modelscope/utils/nlp/fp16/__init__.py b/modelscope/utils/nlp/fp16/__init__.py new file mode 100755 index 00000000..a2c68a1f --- /dev/null +++ b/modelscope/utils/nlp/fp16/__init__.py @@ -0,0 +1,30 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
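+
+# Half-precision training utilities (module/optimizer wrappers and loss
+# scalers), adapted from NVIDIA's apex/Megatron-LM implementations (see the
+# copyright header above).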
+from .fp16util import ( + BN_convert_float, + network_to_half, + prep_param_lists, + model_grads_to_master_grads, + master_params_to_model_params, + tofp16, + to_python_float, + clip_grad_norm, + convert_module, + convert_network, + FP16Model, +) + +from .fp16 import * +from .loss_scaler import * diff --git a/modelscope/utils/nlp/fp16/fp16.py b/modelscope/utils/nlp/fp16/fp16.py new file mode 100755 index 00000000..c1c6af57 --- /dev/null +++ b/modelscope/utils/nlp/fp16/fp16.py @@ -0,0 +1,629 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Stable version of apex FP16 Optimizer""" +import torch +from torch import nn +from torch.autograd import Variable +from torch.nn.parameter import Parameter +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from .loss_scaler import DynamicLossScaler, LossScaler +from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm + +FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) + +def conversion_helper(val, conversion): + """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + +def fp32_to_fp16(val): + """Convert fp32 `val` to fp16""" + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, FLOAT_TYPES): + val = val.half() + return val + return conversion_helper(val, half_conversion) + +def fp16_to_fp32(val): + """Convert fp16 `val` to fp32""" + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, HALF_TYPES): + val = val.float() + return val + return conversion_helper(val, float_conversion) + +class FP16_Module(nn.Module): + def __init__(self, module): + super(FP16_Module, self).__init__() + self.add_module('module', module.half()) + + def forward(self, *inputs, **kwargs): + return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination, prefix, keep_vars) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + +# TODO: Update overflow check + downscale to use Carl's fused kernel. +class FP16_Optimizer(object): + """ + :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, + and manage static or dynamic loss scaling and master weights in a manner transparent to the user. + For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, + and changing the call to ``backward``. 
+ + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + # Name the FP16_Optimizer instance to replace the existing optimizer + # (recommended but not required): + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + # loss.backward() becomes: + optimizer.backward(loss) + ... + + Example with dynamic loss scaling:: + + ... + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # optional arg to control dynamic loss scaling behavior + # dynamic_loss_args={'scale_window' : 500}) + # Usually, dynamic_loss_args is not necessary. + + Args: + init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. + static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. + dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. + dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. + verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. + + ``init_optimizer`` is expected to have been constructed in the ordinary way. + It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be + named to replace ``init_optimizer``, for two reasons: + First, it means that references to the same name + later in the file will not have to change. + Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to + modify ``init_optimizer``. If you do choose a unique name for the new + :class:`FP16_Optimizer` instance, you should only work with this new instance, + because the preexisting optimizer might no longer behave as expected. + + ``init_optimizer`` may be any Pytorch optimizer. + It may contain a mixture of fp16 and fp32 parameters organized into any number of + ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will + ingest these ``param_groups`` and remember them. + + Calls to :: + + loss.backward() + + must be replaced with :: + + optimizer.backward(loss) + + because :class:`FP16_Optimizer` requires ownership of the backward pass to implement + loss scaling and copies to master gradients. + + .. note:: + Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients + are downscaled before being applied. 
This means that adjusting the loss scale, or using
+ dynamic loss scaling, should not require retuning the learning rate or any other
+ hyperparameters.
+
+
+ **Advanced options**
+
+ **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure.
+ See docstring for :attr:`step`.
+
+ **Gradient clipping**: Use :attr:`clip_master_grads`.
+
+ **Multiple losses**: If your model accumulates gradients from multiple losses,
+ this can be made more efficient by supplying ``update_master_grads=False``
+ to :attr:`backward`. See docstring for :attr:`backward`.
+
+ **Manually adjusting loss scale**: The current loss scale can be retrieved or set via ::
+
+ print(optimizer.loss_scale)
+ optimizer.loss_scale = new_loss_scale
+
+ For static loss scaling, manually adjusting the loss scale over time is a reasonable
+ thing to do. During later epochs, gradients may become smaller, and a
+ higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss
+ scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting
+ the loss scale is not recommended.
+
+ **Multi-GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
+ Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
+ should still work as intended.
+ """
+
+ def __init__(self,
+ init_optimizer,
+ static_loss_scale=1.0,
+ dynamic_loss_scale=False,
+ dynamic_loss_args=None,
+ verbose=False):
+ if not torch.cuda.is_available():
+ raise SystemError("Cannot use fp16 without CUDA.")
+
+ self.verbose = verbose
+
+ self.optimizer = init_optimizer
+ # init_state_dict sets up an alternative way to cast per-param state tensors.
+ # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
+ # init_state_dict = init_optimizer.state_dict()
+
+ self.fp16_groups = []
+ self.fp32_from_fp16_groups = []
+ self.fp32_from_fp32_groups = []
+ for i, param_group in enumerate(self.optimizer.param_groups):
+ self.maybe_print("FP16_Optimizer processing param group {}:".format(i))
+ fp16_params_this_group = []
+ fp32_params_this_group = []
+ fp32_from_fp16_params_this_group = []
+ for i, param in enumerate(param_group['params']):
+ if param.requires_grad:
+ if param.type() == 'torch.cuda.HalfTensor':
+ self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
+ .format(param.size()))
+ fp16_params_this_group.append(param)
+ master_param = param.detach().clone().float()
+ master_param.requires_grad = True
+ # Copy the model parallel flag.
+ master_param.model_parallel = param.model_parallel
+ param_group['params'][i] = master_param
+ fp32_from_fp16_params_this_group.append(master_param)
+ # Reset existing state dict key to the new master param.
+ # We still need to recast per-param state tensors, if any, to FP32.
+ if param in self.optimizer.state:
+ self.optimizer.state[master_param] = self.optimizer.state.pop(param)
+ elif param.type() == 'torch.cuda.FloatTensor':
+ self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
+ .format(param.size()))
+ fp32_params_this_group.append(param)
+ param_group['params'][i] = param
+ else:
+ raise TypeError("Wrapped parameters must be either "
+ "torch.cuda.FloatTensor or torch.cuda.HalfTensor.
" + "Received {}".format(param.type())) + + self.fp16_groups.append(fp16_params_this_group) + self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors + self.optimizer.load_state_dict(self.optimizer.state_dict()) + # alternative way to cast per-param state tensors: + # self.optimizer.load_state_dict(init_state_dict) + + if dynamic_loss_scale: + self.dynamic_loss_scale = True + if dynamic_loss_args is not None: + self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + else: + self.loss_scaler = DynamicLossScaler() + else: + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(static_loss_scale) + + self.overflow = False + self.first_closure_call_this_step = True + + self.clip_grad_norm = clip_grad_norm + + def maybe_print(self, msg): + if self.verbose: + print(msg) + + def __getstate__(self): + raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") + + def __setstate__(self, state): + raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().") + + def zero_grad(self, set_grads_to_None=False): + """ + Zero fp32 and fp16 parameter grads. + """ + # In principle, only the .grad attributes of the model params need to be zeroed, + # because gradients are copied into the FP32 master params. However, we zero + # all gradients owned by the optimizer, just to be safe: + for group in self.optimizer.param_groups: + for p in group['params']: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + # Zero fp16 gradients owned by the model: + for fp16_group in self.fp16_groups: + for param in fp16_group: + if set_grads_to_None: + param.grad = None + else: + if param.grad is not None: + param.grad.detach_() # as in torch.optim.optimizer.zero_grad() + param.grad.zero_() + + def _check_overflow(self): + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + for group in self.fp32_from_fp32_groups: + for param in group: + params.append(param) + self.overflow = self.loss_scaler.has_overflow(params) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + def _master_params_to_model_params(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp16_group, fp32_from_fp16_group) + + def _model_params_to_master_params(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp32_from_fp16_group, fp16_group) + + # To consider: Integrate distributed with this wrapper by registering a hook on each variable + # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. + def _model_grads_to_master_grads(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) + + def _downscale_master(self): + if self.loss_scale != 1.0: + for group in self.optimizer.param_groups: + for param in group['params']: + if param.grad is not None: + param.grad.data.mul_(1./self.loss_scale) + + def clip_master_grads(self, max_norm, norm_type=2): + """ + Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. 
+ + Args: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the current fp32 gradients (viewed as a single vector). + + .. warning:: + Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). + """ + if not self.overflow: + fp32_params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + fp32_params.append(param) + return self.clip_grad_norm(fp32_params, max_norm, norm_type) + else: + return -1 + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. + Example:: + + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step + state_dict['optimizer_state_dict'] = self.optimizer.state_dict() + state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups + return state_dict + + def load_state_dict(self, state_dict): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + self.first_closure_call_this_step = state_dict['first_closure_call_this_step'] + self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 2. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. 
+ for current_group, saved_group in zip(self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']): + for current, saved in zip(current_group, saved_group): + current.data.copy_(saved.data) + + def step(self, closure=None): # could add clip option. + """ + If no closure is supplied, :attr:`step` should be called after + ``fp16_optimizer_obj.backward(loss)``. + :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to + :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params + originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run + another forward pass using their model. + + If a closure is supplied, :attr:`step` may be called without a prior call to + :attr:`backward(loss)`. + This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. + However, the user should take care that any ``loss.backward()`` call within the closure + has been replaced by ``fp16_optimizer_obj.backward(loss)``. + + Args: + closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. + + Example with closure:: + + # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an + # existing pytorch optimizer. + for input, target in dataset: + def closure(): + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + # loss.backward() becomes: + optimizer.backward(loss) + return loss + optimizer.step(closure) + + .. warning:: + Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. + + .. _`ordinary Pytorch optimizer use`: + http://pytorch.org/docs/master/optim.html#optimizer-step-closure + """ + + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + + if self.overflow: + self.maybe_print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}" + .format(scale, self.loss_scale)) + return + + if closure is not None: + retval = self._step_with_closure(closure) + else: + retval = self.optimizer.step() + + self._master_params_to_model_params() + + return retval + + def _step_with_closure(self, closure): + def wrapped_closure(): + # helpful for debugging + # print("Calling wrapped_closure, first_closure_call_this_step = {}" + # .format(self.first_closure_call_this_step)) + if self.first_closure_call_this_step: + # We expect that the fp16 params are initially fresh on entering self.step(), + # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() + # is called within self.optimizer.step(). + self.first_closure_call_this_step = False + else: + # If self.optimizer.step() internally calls wrapped_closure more than once, + # it may update the fp32 params after each call. However, self.optimizer + # doesn't know about the fp16 params at all. If the fp32 params get updated, + # we can't rely on self.optimizer to refresh the fp16 params. We need + # to handle that manually: + self._master_params_to_model_params() + # Our API expects the user to give us ownership of the backward() call by + # replacing all calls to loss.backward() with optimizer.backward(loss). + # This requirement holds whether or not the call to backward() is made within a closure. 
+ # If the user is properly calling optimizer.backward(loss) within "closure," + # calling closure() here will give the fp32 master params fresh gradients + # for the optimizer to play with, so all wrapped_closure needs to do is call + # closure() and return the loss. + temp_loss = closure() + while(self.overflow): + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + self.maybe_print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, " + "reducing to {}".format(scale, self.loss_scale)) + temp_loss = closure() + return temp_loss + + retval = self.optimizer.step(wrapped_closure) + + self.first_closure_call_this_step = True + + return retval + + def backward(self, loss, update_master_grads=True, retain_graph=False): + """ + :attr:`backward` performs the following conceptual steps: + + 1. fp32_loss = loss.float() (see first Note below) + 2. scaled_loss = fp32_loss*loss_scale + 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). + 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. + 5. Finally, master grads are divided by loss_scale. + + In this way, after :attr:`backward`, the master params have fresh gradients, + and :attr:`step` may be called. + + .. note:: + :attr:`backward` internally converts the loss to fp32 before applying the loss scale. + This provides some additional safety against overflow if the user has supplied an + fp16 loss value. + However, for maximum overflow safety, the user should + compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to + :attr:`backward`. + + .. warning:: + The gradients found in a model's leaves after the call to + :attr:`backward` should not be regarded as valid in general, + because it's possible + they have been scaled (and in the case of dynamic loss scaling, + the scale factor may change over time). + If the user wants to inspect gradients after a call to :attr:`backward`, + only the master gradients should be regarded as valid. These can be retrieved via + :attr:`inspect_master_grad_data()`. + + Args: + loss: The loss output by the user's model. loss may be either float or half (but see first Note above). + update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. + retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). + + Example:: + + # Ordinary operation: + optimizer.backward(loss) + + # Naive operation with multiple losses (technically valid, but less efficient): + # fp32 grads will be correct after the second call, but + # the first call incurs an unnecessary fp16->fp32 grad copy. 
+            optimizer.backward(loss1)
+            optimizer.backward(loss2)
+
+            # More efficient way to handle multiple losses:
+            # The fp16->fp32 grad copy is delayed until fp16 grads from all
+            # losses have been accumulated.
+            optimizer.backward(loss1, update_master_grads=False)
+            optimizer.backward(loss2, update_master_grads=False)
+            optimizer.update_master_grads()
+        """
+        # To consider: try multiple backward passes using retain_graph=True to find
+        # a loss scale that works. After you find a loss scale that works, do a final dummy
+        # backward pass with retain_graph=False to tear down the graph. Doing this would avoid
+        # discarding the iteration, but probably wouldn't improve overall efficiency.
+        self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
+        if update_master_grads:
+            self.update_master_grads()
+
+    def update_master_grads(self):
+        """
+        Copy the ``.grad`` attribute from stored references to fp16 parameters to
+        the ``.grad`` attribute of the fp32 master parameters that are directly
+        updated by the optimizer. :attr:`update_master_grads` only needs to be called if
+        ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
+        """
+        if self.dynamic_loss_scale:
+            self._check_overflow()
+            if self.overflow:
+                return
+        self._model_grads_to_master_grads()
+        self._downscale_master()
+
+    def inspect_master_grad_data(self):
+        """
+        When running with :class:`FP16_Optimizer`,
+        ``.grad`` attributes of a model's fp16 leaves should not be
+        regarded as truthful, because they might be scaled.
+        After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
+        the fp32 master params' ``.grad``
+        attributes will contain valid gradients properly divided by the loss scale. However,
+        because :class:`FP16_Optimizer` flattens some parameters, accessing them may be
+        nonintuitive. :attr:`inspect_master_grad_data`
+        allows those gradients to be viewed with shapes corresponding to their associated model leaves.
+
+        Returns:
+            List of lists (one list for each parameter group). The list for each parameter group
+            is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group.
+        """
+        if self.overflow:
+            print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. "
+                  "Gradients are currently invalid (may be inf, nan, or stale). Returning None.")
+            return None
+        else:
+            # The optimizer owns only references to master params.
+            master_grads_data = []
+            for param_group in self.optimizer.param_groups:
+                master_grads_this_group = []
+                for param in param_group['params']:
+                    if param.grad is not None:
+                        master_grads_this_group.append(param.grad.data)
+                    else:
+                        master_grads_this_group.append(None)
+                master_grads_data.append(master_grads_this_group)
+            return master_grads_data
+
+    # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
+    def _get_loss_scale(self):
+        return self.loss_scaler.loss_scale
+
+    def _set_loss_scale(self, value):
+        self.loss_scaler.cur_scale = value
+
+    loss_scale = property(_get_loss_scale, _set_loss_scale)
+
+    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
+    def _get_state(self):
+        return self.optimizer.state
+
+    def _set_state(self, value):
+        self.optimizer.state = value
+
+    state = property(_get_state, _set_state)
+
+    # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
+    # (for example, to adjust the learning rate)
+    def _get_param_groups(self):
+        return self.optimizer.param_groups
+
+    def _set_param_groups(self, value):
+        self.optimizer.param_groups = value
+
+    param_groups = property(_get_param_groups, _set_param_groups)
diff --git a/modelscope/utils/nlp/fp16/fp16util.py b/modelscope/utils/nlp/fp16/fp16util.py
new file mode 100755
index 00000000..08d13733
--- /dev/null
+++ b/modelscope/utils/nlp/fp16/fp16util.py
@@ -0,0 +1,204 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+# This module lives in modelscope, so import mpu from the local package
+# rather than from the original `sofa` code base.
+from modelscope.utils.nlp import mpu
+
+
+class tofp16(nn.Module):
+    """
+    Utility module that implements::
+
+        def forward(self, input):
+            return input.half()
+    """
+
+    def __init__(self):
+        super(tofp16, self).__init__()
+
+    def forward(self, input):
+        return input.half()
+
+
+def BN_convert_float(module):
+    """
+    Utility function for network_to_half().
+
+    Retained for legacy purposes.
+    """
+    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
+        module.float()
+    for child in module.children():
+        BN_convert_float(child)
+    return module
+
+
+def network_to_half(network):
+    """
+    Convert model to half precision in a batchnorm-safe way.
+
+    Retained for legacy purposes. It is recommended to use FP16Model.
+    """
+    return nn.Sequential(tofp16(), BN_convert_float(network.half()))
+
+
+def convert_module(module, dtype):
+    """
+    Converts a module's immediate parameters and buffers to dtype.
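+
+    Example (illustrative sketch; ``net`` is an assumed stand-in for any
+    module you have built)::
+
+        import torch.nn as nn
+        net = nn.Linear(4, 4)
+        convert_module(net, torch.half)   # immediate params/buffers become fp16
+        convert_module(net, torch.float)  # and can be converted back to fp32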
+ """ + for param in module.parameters(recurse=False): + if param is not None: + if param.data.dtype.is_floating_point: + param.data = param.data.to(dtype=dtype) + if param._grad is not None and param._grad.data.dtype.is_floating_point: + param._grad.data = param._grad.data.to(dtype=dtype) + + for buf in module.buffers(recurse=False): + if buf is not None and buf.data.dtype.is_floating_point: + buf.data = buf.data.to(dtype=dtype) + + +def convert_network(network, dtype): + """ + Converts a network's parameters and buffers to dtype. + """ + for module in network.modules(): + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + continue + convert_module(module, dtype) + return network + + +class FP16Model(nn.Module): + """ + Convert model to half precision in a batchnorm-safe way. + """ + + def __init__(self, network): + super(FP16Model, self).__init__() + self.network = convert_network(network, dtype=torch.half) + + def forward(self, *inputs): + inputs = tuple(t.half() for t in inputs) + return self.network(*inputs) + + +def backwards_debug_hook(grad): + raise RuntimeError("master_params recieved a gradient in the backward pass!") + +def prep_param_lists(model, flat_master=False): + """ + Creates a list of FP32 master parameters for a given model, as in + `Training Neural Networks with Mixed Precision: Real Examples`_. + + Args: + model (torch.nn.Module): Existing Pytorch model + flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. + Returns: + A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. + + Example:: + + model_params, master_params = prep_param_lists(model) + + .. warning:: + Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. + + .. _`Training Neural Networks with Mixed Precision: Real Examples`: + http://on-demand.gputechconf.com/gtc/2018/video/S81012/ + """ + model_params = [param for param in model.parameters() if param.requires_grad] + + if flat_master: + # Give the user some more useful error messages + try: + # flatten_dense_tensors returns a contiguous flat array. + # http://pytorch.org/docs/master/_modules/torch/_utils.html + master_params = _flatten_dense_tensors([param.data for param in model_params]).float() + except: + print("Error in prep_param_lists: model may contain a mixture of parameters " + "of different types. Use flat_master=False, or use F16_Optimizer.") + raise + master_params = torch.nn.Parameter(master_params) + master_params.requires_grad = True + # master_params.register_hook(backwards_debug_hook) + if master_params.grad is None: + master_params.grad = master_params.new(*master_params.size()) + return model_params, [master_params] + else: + master_params = [param.clone().float().detach() for param in model_params] + for param in master_params: + param.requires_grad = True + return model_params, master_params + + +def model_grads_to_master_grads(model_params, master_params, flat_master=False): + """ + Copy model gradients to master gradients. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. 
+        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
+    """
+    if flat_master:
+        # The flattening may incur one more deep copy than is necessary.
+        master_params[0].grad.data.copy_(
+            _flatten_dense_tensors([p.grad.data for p in model_params]))
+    else:
+        for model, master in zip(model_params, master_params):
+            if model.grad is not None:
+                if master.grad is None:
+                    master.grad = Variable(master.data.new(*master.data.size()))
+                master.grad.data.copy_(model.grad.data)
+            else:
+                master.grad = None
+
+
+def master_params_to_model_params(model_params, master_params, flat_master=False):
+    """
+    Copy master parameters to model parameters.
+
+    Args:
+        model_params: List of model parameters created by :func:`prep_param_lists`.
+        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
+    """
+    if flat_master:
+        for model, master in zip(model_params,
+                                 _unflatten_dense_tensors(master_params[0].data, model_params)):
+            model.data.copy_(master)
+    else:
+        for model, master in zip(model_params, master_params):
+            model.data.copy_(master.data)
+
+
+# Backward compatibility fixes
+
+def to_python_float(t):
+    if hasattr(t, 'item'):
+        return t.item()
+    else:
+        return t[0]
+
+
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+# Model parallel parameters must be clipped with mpu.clip_grad_norm, which
+# also reduces the norm across model parallel ranks. (For non-model-parallel
+# code the plain torch alternatives would be torch.nn.utils.clip_grad_norm
+# on torch <= 0.4 and torch.nn.utils.clip_grad_norm_ on newer versions.)
+clip_grad_norm = mpu.clip_grad_norm
diff --git a/modelscope/utils/nlp/fp16/loss_scaler.py b/modelscope/utils/nlp/fp16/loss_scaler.py
new file mode 100755
index 00000000..e92d8c03
--- /dev/null
+++ b/modelscope/utils/nlp/fp16/loss_scaler.py
@@ -0,0 +1,237 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+# This module lives in modelscope, so import mpu from the local package
+# rather than from the original `sofa` code base.
+from modelscope.utils.nlp import mpu
+
+
+# item() is a recent addition, so this helps with backward compatibility.
+def to_python_float(t):
+    if hasattr(t, 'item'):
+        return t.item()
+    else:
+        return t[0]
+
+
+class LossScaler:
+    """
+    Class that manages a static loss scale. This class is intended to interact with
+    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.
+
+    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
+    :class:`FP16_Optimizer`'s constructor.
+
+    Args:
+        scale (float, optional, default=1.0): The loss scale.
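+
+    Example (illustrative sketch of manual use; ``loss`` is assumed to come
+    from a forward pass, and normal usage goes through :class:`FP16_Optimizer`
+    rather than calling this directly)::
+
+        loss_scaler = LossScaler(scale=128.0)
+        loss_scaler.backward(loss)  # backprops loss * 128.0
+        # the resulting gradients must later be divided by loss_scaler.loss_scale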
+ """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss*self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
+ """ + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): + return True + + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + overflow_gpu = torch.cuda.ByteTensor([overflow]) + torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) + overflow = overflow_gpu[0].item() + return bool(overflow) + + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if "value cannot be converted" not in instance.args[0]: + raise + return True + else: + if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether the gradient overflowed + def update_scale(self, overflow): + + if not hasattr(self, 'min_scale'): + self.min_scale = 1 + if not hasattr(self, 'delayed_shift'): + self.delayed_shift = 1 + if not hasattr(self, 'cur_hysteresis'): + self.cur_hysteresis = 1 + if not hasattr(self, 'consecutive_hysteresis'): + self.consecutive_hysteresis = True + if overflow: + # self.cur_scale /= self.scale_factor + if self.delayed_shift == 1 or self.cur_hysteresis == 1: + self.cur_scale = max(self.cur_scale/self.scale_factor, self.min_scale) + else: + self.cur_hysteresis -= 1 + self.last_overflow_iter = self.cur_iter + else: + if self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: + if not self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss*self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +""" +TO-DO separate out into an example. 
+if __name__ == "__main__":
+    import torch
+    from torch.autograd import Variable
+    from dynamic_loss_scaler import DynamicLossScaler
+
+    # N is batch size; D_in is input dimension;
+    # H is hidden dimension; D_out is output dimension.
+    N, D_in, H, D_out = 64, 1000, 100, 10
+
+    # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
+    x = Variable(torch.randn(N, D_in), requires_grad=False)
+    y = Variable(torch.randn(N, D_out), requires_grad=False)
+
+    w1 = Variable(torch.randn(D_in, H), requires_grad=True)
+    w2 = Variable(torch.randn(H, D_out), requires_grad=True)
+    parameters = [w1, w2]
+
+    learning_rate = 1e-6
+    optimizer = torch.optim.SGD(parameters, lr=learning_rate)
+    loss_scaler = DynamicLossScaler()
+
+    for t in range(500):
+        y_pred = x.mm(w1).clamp(min=0).mm(w2)
+        loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
+        print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
+        print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
+        print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
+
+        # Run backprop
+        optimizer.zero_grad()
+        loss.backward()
+
+        # Check for overflow
+        has_overflow = loss_scaler.has_overflow(parameters)
+
+        # If no overflow, unscale grad and update as usual
+        if not has_overflow:
+            for param in parameters:
+                param.grad.data.mul_(1. / loss_scaler.loss_scale)
+            optimizer.step()
+        # Otherwise, don't do anything -- i.e., skip the iteration
+        else:
+            print('OVERFLOW!')
+
+        # Update loss scale for next iteration
+        loss_scaler.update_scale(has_overflow)
+
+"""
diff --git a/modelscope/utils/nlp/load_checkpoint.py b/modelscope/utils/nlp/load_checkpoint.py
new file mode 100755
index 00000000..e92e0974
--- /dev/null
+++ b/modelscope/utils/nlp/load_checkpoint.py
@@ -0,0 +1,102 @@
+import os
+
+import torch
+
+
+def load_checkpoint(model,
+                    load_dir,
+                    tag,
+                    load_module_strict=True,
+                    load_optimizer_states=True,
+                    load_lr_scheduler_states=True):
+    r"""Load a training checkpoint.
+
+    Arguments:
+        model: Required. The (deepspeed-style) model engine to restore into.
+        load_dir: Required. Directory to load the checkpoint from.
+        tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step.
+        load_module_strict: Optional. Boolean to strictly enforce that the keys in the state_dict of the module and the checkpoint match.
+        load_optimizer_states: Optional. Boolean to load the training optimizer states from the checkpoint. Ex. ADAM's momentum and variance.
+        load_lr_scheduler_states: Optional. Boolean to load the learning rate scheduler states from the checkpoint.
+    Return:
+        load_path: Path of the loaded checkpoint. None if loading the checkpoint failed.
+        client_state: State dictionary used for loading required training states in the client code.
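+
+    Example (hypothetical sketch; ``engine`` stands for a deepspeed-style model
+    engine, and the directory and tag below are placeholder names)::
+
+        load_path, client_state = load_checkpoint(engine, './checkpoints',
+                                                  'global_step1000')
+        if load_path is None:
+            print('no checkpoint found for the requested tag')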
+ """ + + load_path, client_states = _load_checkpoint(model, + load_dir, + tag, + load_module_strict=load_module_strict, + load_optimizer_states=load_optimizer_states, + load_lr_scheduler_states=load_lr_scheduler_states) + + if load_optimizer_states: + if model.zero_optimization() and load_path is not None: + model._load_zero_checkpoint(load_dir, + tag, + load_optimizer_states=load_optimizer_states) + + return load_path, client_states + +def _get_ckpt_name(mpu, checkpoints_path, tag): + mp_rank = 0 if mpu is None else mpu.get_model_parallel_rank() + ckpt_name = os.path.join(checkpoints_path, + str(tag), + 'mp_rank_{:02d}'.format(mp_rank) + '_model_states.pt') + return ckpt_name + +def pre_load(mpu, + load_dir, + tag=''): + load_path = _get_ckpt_name(mpu, load_dir, tag) + checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage) + return checkpoint['module'] + +def _load_checkpoint(model, + load_dir, + tag, + load_module_strict=True, + load_optimizer_states=True, + load_lr_scheduler_states=True): + + load_path = model._get_ckpt_name(load_dir, tag) + + if not os.path.exists(load_path): + return None, None + + checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage) + + model.load_module_state_dict(state_dict=checkpoint['module'], + strict=load_module_strict) + if not model.zero_optimization() and load_optimizer_states: + if model.fp16_enabled(): + model.optimizer.load_state_dict( + checkpoint['optimizer'], + load_optimizer_states=load_optimizer_states) + elif load_optimizer_states: + model.optimizer.load_state_dict(checkpoint['optimizer']) + + if load_lr_scheduler_states and model.lr_scheduler is not None: + model.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + + model.csr_tensor_module_names = checkpoint['csr_tensor_module_names'] + model.global_steps = checkpoint['global_steps'] + model.global_samples = checkpoint.get('global_samples', + model.global_steps * model.train_batch_size()) + model.skipped_steps = checkpoint['skipped_steps'] + model.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size'] + model.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size'] + deepspeed_states = [ + 'module', + 'optimizer', + 'lr_scheduler', + 'csr_tensor_module_names', + 'skipped_steps', + 'global_steps', + 'dp_world_size', + 'mp_world_size' + ] + client_state = { + key: value + for key, + value in checkpoint.items() if not key in deepspeed_states + } + + return load_path, client_state diff --git a/modelscope/utils/nlp/mpu/__init__.py b/modelscope/utils/nlp/mpu/__init__.py new file mode 100755 index 00000000..e150e0c7 --- /dev/null +++ b/modelscope/utils/nlp/mpu/__init__.py @@ -0,0 +1,55 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Model parallel utility interface.""" + +from .cross_entropy import vocab_parallel_cross_entropy + +from .data import broadcast_data + +from .grads import clip_grad_norm + +from .initialize import destroy_model_parallel +from .initialize import get_data_parallel_group +from .initialize import get_data_parallel_rank +from .initialize import get_data_parallel_world_size +from .initialize import get_model_parallel_group +from .initialize import get_model_parallel_rank +from .initialize import get_model_parallel_src_rank +from .initialize import get_model_parallel_world_size +from .initialize import initialize_model_parallel +from .initialize import model_parallel_is_initialized + +from .layers import ColumnParallelLinear +from .layers import ParallelEmbedding +from .layers import RowParallelLinear +from .layers import VocabParallelEmbedding + +from .mappings import copy_to_model_parallel_region +from .mappings import gather_from_model_parallel_region +from .mappings import reduce_from_model_parallel_region +from .mappings import scatter_to_model_parallel_region +from .mappings import _gather + +from .random import checkpoint +from .random import partition_activations_in_checkpoint +from .random import get_cuda_rng_tracker +from .random import model_parallel_cuda_manual_seed + +from .transformer import BertParallelSelfAttention +from .transformer import BertParallelTransformerLayer +from .transformer import GPT2ParallelTransformer, GPT2ParallelSelfAttention +from .transformer import PalmParallelCrossAttention +from .transformer import LayerNorm diff --git a/modelscope/utils/nlp/mpu/binarizer.py b/modelscope/utils/nlp/mpu/binarizer.py new file mode 100644 index 00000000..2bdebd1b --- /dev/null +++ b/modelscope/utils/nlp/mpu/binarizer.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2020-present, AllenAI Authors, University of Illinois Urbana-Champaign, +# Intel Nervana Systems and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Binarizers take a (real value) matrix as input and produce a binary (values in {0,1}) mask of the same shape. +""" + +import torch +from torch import autograd + + +class ThresholdBinarizer(autograd.Function): + """ + Thresholdd binarizer. + Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j} > \tau` + where `\tau` is a real value threshold. + + Implementation is inspired from: + https://github.com/arunmallya/piggyback + Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights + Arun Mallya, Dillon Davis, Svetlana Lazebnik + """ + + @staticmethod + def forward(ctx, inputs: torch.tensor, threshold: float, sigmoid: bool): + """ + Args: + inputs (`torch.FloatTensor`) + The input matrix from which the binarizer computes the binary mask. + threshold (`float`) + The threshold value (in R). + sigmoid (`bool`) + If set to ``True``, we apply the sigmoid function to the `inputs` matrix before comparing to `threshold`. 
+ In this case, `threshold` should be a value between 0 and 1. + Returns: + mask (`torch.FloatTensor`) + Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is + retained, 0 - the associated weight is pruned). + """ + nb_elems = inputs.numel() + nb_min = int(0.005 * nb_elems) + 1 + if sigmoid: + mask = (torch.sigmoid(inputs) > threshold).type(inputs.type()).bool() + else: + mask = (inputs > threshold).type(inputs.type()).bool() + if mask.sum() < nb_min: + # We limit the pruning so that at least 0.5% (half a percent) of the weights are remaining + k_threshold = inputs.flatten().kthvalue(max(nb_elems - nb_min, 1)).values + mask = (inputs > k_threshold).type(inputs.type()).bool() + return mask + + @staticmethod + def backward(ctx, gradOutput): + return gradOutput, None, None + + +class TopKBinarizer(autograd.Function): + """ + Top-k Binarizer. + Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}` + is among the k% highest values of S. + + Implementation is inspired from: + https://github.com/allenai/hidden-networks + What's hidden in a randomly weighted neural network? + Vivek Ramanujan*, Mitchell Wortsman*, Aniruddha Kembhavi, Ali Farhadi, Mohammad Rastegari + """ + + @staticmethod + def forward(ctx, inputs: torch.tensor, threshold: float, k_threshold=None): + """ + Args: + inputs (`torch.FloatTensor`) + The input matrix from which the binarizer computes the binary mask. + threshold (`float`) + The percentage of weights to keep (the rest is pruned). + `threshold` is a float between 0 and 1. + Returns: + mask (`torch.FloatTensor`) + Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is + retained, 0 - the associated weight is pruned). + """ + # Get the subnetwork by sorting the inputs and using the top threshold % + if k_threshold is None: + mask = inputs.clone() + _, idx = inputs.flatten().sort(descending=True) + j = int(threshold * inputs.numel()) + # flat_out and mask access the same memory. + flat_out = mask.flatten() + flat_out[idx[j:]] = 0 + flat_out[idx[:j]] = 1 + + # if threshold == 1: + # k_threshold = -1000 + # else: + # n = inputs.numel() + # kth = min(max(n - (int(n * threshold) + 1), 1), n) + # k_threshold = inputs.flatten().kthvalue(kth).values + # mask = (inputs > k_threshold).type(inputs.type()) + else: + if threshold == 1.0: + mask = (inputs > -1000).type(inputs.type()) + else: + mask = (inputs > k_threshold).type(inputs.type()) + + # # Get the subnetwork by get the kthvalue + # # ==> This method will cause bug since if all the mask_scores are the same, the mask is all zero. 
+        # n = inputs.numel()
+        # kth = max(n - (int(n * threshold) + 1), 1)
+        # k_threshold = inputs.flatten().kthvalue(kth).values
+        # mask = (inputs > k_threshold).type(inputs.type())
+
+        return mask
+
+    @staticmethod
+    def backward(ctx, gradOutput):
+        return gradOutput, None, None
+
+
+class MagnitudeBinarizer(object):
+    """
+    Magnitude Binarizer.
+    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
+    is among the k% highest values of |S| (absolute value).
+
+    Implementation is inspired by https://github.com/NervanaSystems/distiller/blob/2291fdcc2ea642a98d4e20629acb5a9e2e04b4e6/distiller/pruning/automated_gradual_pruner.py#L24
+    """
+
+    @staticmethod
+    def apply(inputs: torch.Tensor, threshold: float):
+        """
+        Args:
+            inputs (`torch.FloatTensor`)
+                The input matrix from which the binarizer computes the binary mask.
+                This input matrix is typically the weight matrix.
+            threshold (`float`)
+                The percentage of weights to keep (the rest is pruned).
+                `threshold` is a float between 0 and 1.
+        Returns:
+            mask (`torch.FloatTensor`)
+                Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
+                retained, 0 - the associated weight is pruned).
+        """
+        # Get the subnetwork by sorting the inputs and using the top threshold %
+        mask = inputs.clone()
+        _, idx = inputs.abs().flatten().sort(descending=True)
+        j = int(threshold * inputs.numel())
+        # flat_out and mask access the same memory.
+        flat_out = mask.flatten()
+        flat_out[idx[j:]] = 0
+        flat_out[idx[:j]] = 1
+        # mask = mask.bool()
+        return mask
+
+        # # Alternative: get the subnetwork via kthvalue
+        # # ==> This method will cause a bug: if all the mask_scores are the same, the mask is all zero.
+        # n = inputs.numel()
+        # kth = max(n - (int(n * threshold) + 1), 1)
+        # k_threshold = inputs.abs().flatten().kthvalue(kth).values
+        # mask = (inputs > k_threshold).type(inputs.type())
+        # return mask
+
+
+class MaskTaylor(autograd.Function):
+    # Straight-through weight masking. Note that the backward pass replaces the
+    # true mask gradient with a negative squared first-order (Taylor) importance
+    # score rather than the analytic gradient.
+    @staticmethod
+    def forward(ctx, weight, mask):
+        ctx.save_for_backward(weight, mask)
+        return mask * weight
+
+    @staticmethod
+    def backward(ctx, gradOutput):
+        weight, mask = ctx.saved_tensors
+        return gradOutput * mask, -torch.pow(gradOutput * weight, 2)
+        # return gradOutput * mask, -torch.abs(gradOutput * weight)
diff --git a/modelscope/utils/nlp/mpu/cross_entropy.py b/modelscope/utils/nlp/mpu/cross_entropy.py
new file mode 100755
index 00000000..845f0441
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/cross_entropy.py
@@ -0,0 +1,109 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+
+from .initialize import get_model_parallel_group
+from .initialize import get_model_parallel_rank
+from .initialize import get_model_parallel_world_size
+from .utils import VocabUtility
+
+
+class _VocabParallelCrossEntropy(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, vocab_parallel_logits, target):
+
+        # Copy so the input remains unchanged.
+        logits = vocab_parallel_logits.clone()
+        # Maximum value along vocab dimension across all GPUs.
+        logits_max = torch.max(logits, dim=-1)[0]
+        torch.distributed.all_reduce(logits_max,
+                                     op=torch.distributed.ReduceOp.MAX,
+                                     group=get_model_parallel_group())
+        # Subtract the maximum value.
+        logits.sub_(logits_max.unsqueeze(dim=-1))
+        # Sum of exponential of logits along vocab dimension across all GPUs.
+        exp_logits = logits.exp()
+        sum_exp_logits = exp_logits.sum(dim=-1)
+        torch.distributed.all_reduce(sum_exp_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=get_model_parallel_group())
+
+        # Get the partition's vocab indices
+        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
+        partition_vocab_size = vocab_parallel_logits.size()[-1]
+        rank = get_model_parallel_rank()
+        world_size = get_model_parallel_world_size()
+        vocab_start_index, vocab_end_index = get_vocab_range(
+            partition_vocab_size, rank, world_size)
+
+        # Create a mask of valid vocab ids (1 means it needs to be masked).
+        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
+        masked_target = target.clone() - vocab_start_index
+        masked_target[target_mask] = 0
+
+        # Get predicted-logits = logits[target].
+        # For simplicity, we convert logits to a 2-D tensor with size
+        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
+        logits_2d = logits.view(-1, partition_vocab_size)
+        masked_target_1d = masked_target.view(-1)
+        arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
+                                 device=logits_2d.device)
+        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
+        predicted_logits = predicted_logits_1d.view_as(target)
+        predicted_logits[target_mask] = 0.0
+        # All reduce is needed to get the chunks from other GPUs.
+        torch.distributed.all_reduce(predicted_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=get_model_parallel_group())
+
+        # Loss = log(sum(exp(logits))) - predicted-logit.
+        loss = torch.log(sum_exp_logits) - predicted_logits
+
+        # Store softmax, target-mask and masked-target for backward pass.
+        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
+        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
+
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+
+        # Retrieve tensors from the forward path.
+        softmax, target_mask, masked_target_1d = ctx.saved_tensors
+
+        # All the inputs have softmax as their gradient.
+        grad_input = softmax
+        # For simplicity, work with the 2D gradient.
+        partition_vocab_size = softmax.size()[-1]
+        grad_2d = grad_input.view(-1, partition_vocab_size)
+
+        # Add the gradient from matching classes.
+        arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
+                                 device=grad_2d.device)
+        grad_2d[arange_1d, masked_target_1d] -= (
+            1.0 - target_mask.view(-1).float())
+
+        # Finally elementwise multiplication with the output gradients.
+        grad_input.mul_(grad_output.unsqueeze(dim=-1))
+
+        return grad_input, None
+
+
+def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
+    """Helper function for the cross entropy."""
+    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
diff --git a/modelscope/utils/nlp/mpu/data.py b/modelscope/utils/nlp/mpu/data.py
new file mode 100755
index 00000000..5008a3a1
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/data.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from .initialize import get_model_parallel_group
+from .initialize import get_model_parallel_rank
+from .initialize import get_model_parallel_src_rank
+
+
+_MAX_DATA_DIM = 5
+
+
+def _check_data_types(keys, data, target_dtype):
+    """Check that all the keys have the same target data type."""
+    for key in keys:
+        assert data[key].dtype == target_dtype, '{} has data type {} which '\
+            'is different than {}'.format(key, data[key].dtype, target_dtype)
+
+
+def _build_key_size_numel_dictionaries(keys, data):
+    """Build the size on rank 0 and broadcast."""
+    max_dim = _MAX_DATA_DIM
+    sizes = [0 for _ in range(max_dim) for _ in keys]
+
+    # Pack the sizes on rank zero.
+    if get_model_parallel_rank() == 0:
+        offset = 0
+        for key in keys:
+            assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
+            size = data[key].size()
+            for i, s in enumerate(size):
+                sizes[i + offset] = s
+            offset += max_dim
+
+    # Move to GPU and broadcast.
+    sizes_cuda = torch.cuda.LongTensor(sizes)
+    torch.distributed.broadcast(sizes_cuda, get_model_parallel_src_rank(),
+                                group=get_model_parallel_group())
+
+    # Move back to cpu and unpack.
+    sizes_cpu = sizes_cuda.cpu()
+    key_size = {}
+    key_numel = {}
+    total_numel = 0
+    offset = 0
+    for key in keys:
+        i = 0
+        size = []
+        numel = 1
+        while sizes_cpu[offset + i] > 0:
+            this_size = sizes_cpu[offset + i]
+            size.append(this_size)
+            numel *= this_size
+            i += 1
+        key_size[key] = size
+        key_numel[key] = numel
+        total_numel += numel
+        offset += max_dim
+
+    return key_size, key_numel, total_numel
+
+
+def broadcast_data(keys, data, datatype):
+    """Broadcast data from rank zero of each model parallel group to the
+    members of the same model parallel group.
+
+    Arguments:
+        keys: list of keys in the data dictionary to be broadcast
+        data: data dictionary of string keys and cpu tensor values.
+        datatype: torch data type of all tensors in data associated
+                  with keys.
+    """
+    # Build (key, size) and (key, number of elements) dictionaries along
+    # with the total number of elements on all ranks.
+    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
+                                                                          data)
+
+    # Pack on rank zero.
+    if get_model_parallel_rank() == 0:
+        # Check that all keys have the same data type.
+        _check_data_types(keys, data, datatype)
+        # Flatten the data associated with the keys.
+        flatten_data = torch.cat(
+            [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
+    else:
+        flatten_data = torch.empty(total_numel,
+                                   device=torch.cuda.current_device(),
+                                   dtype=datatype)
+
+    # Broadcast
+    torch.distributed.broadcast(flatten_data, get_model_parallel_src_rank(),
+                                group=get_model_parallel_group())
+
+    # Unpack
+    output = {}
+    offset = 0
+    for key in keys:
+        size = key_size[key]
+        numel = key_numel[key]
+        output[key] = flatten_data.narrow(0, offset, numel).view(size)
+        offset += numel
+
+    return output
diff --git a/modelscope/utils/nlp/mpu/grads.py b/modelscope/utils/nlp/mpu/grads.py
new file mode 100755
index 00000000..0ae9cf5d
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/grads.py
@@ -0,0 +1,74 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+
+import torch
+try:
+    from torch._six import inf
+except ImportError:  # torch >= 1.13 no longer ships torch._six
+    from math import inf
+
+from .initialize import get_model_parallel_group
+from .initialize import get_model_parallel_rank
+
+
+def clip_grad_norm(parameters, max_norm, norm_type=2):
+    """Clips gradient norm of an iterable of parameters.
+
+    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ with
+    added functionality to handle model parallel parameters. Note that
+    the gradients are modified in place.
+
+    Arguments:
+        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+            single Tensor that will have gradients normalized
+        max_norm (float or int): max norm of the gradients
+        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+            infinity norm.
+
+    Returns:
+        Total norm of the parameters (viewed as a single vector).
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = list(filter(lambda p: p.grad is not None, parameters))
+    max_norm = float(max_norm)
+    norm_type = float(norm_type)
+    if norm_type == inf:
+        total_norm = max(p.grad.data.abs().max() for p in parameters)
+        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
+        # Take max across all GPUs.
+        torch.distributed.all_reduce(total_norm_cuda,
+                                     op=torch.distributed.ReduceOp.MAX,
+                                     group=get_model_parallel_group())
+        total_norm = total_norm_cuda[0].item()
+    else:
+        total_norm = 0
+        for p in parameters:
+            if p.model_parallel or (get_model_parallel_rank() == 0):
+                param_norm = p.grad.data.norm(norm_type)
+                total_norm += param_norm.item() ** norm_type
+        # Sum across all model parallel GPUs.
+        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
+        torch.distributed.all_reduce(total_norm_cuda,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=get_model_parallel_group())
+        total_norm = total_norm_cuda[0].item() ** (1. / norm_type)
+    clip_coef = max_norm / (total_norm + 1e-6)
+    if clip_coef < 1:
+        for p in parameters:
+            p.grad.data.mul_(clip_coef)
+    return total_norm
diff --git a/modelscope/utils/nlp/mpu/initialize.py b/modelscope/utils/nlp/mpu/initialize.py
new file mode 100755
index 00000000..15656065
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/initialize.py
@@ -0,0 +1,134 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Model and data parallel groups."""
+
+import torch
+
+from .utils import ensure_divisibility
+
+
+# Model parallel group that the current rank belongs to.
+_MODEL_PARALLEL_GROUP = None
+# Data parallel group that the current rank belongs to.
+_DATA_PARALLEL_GROUP = None
+
+
+def initialize_model_parallel(model_parallel_size_):
+    """
+    Initialize model and data parallel groups.
+
+    Arguments:
+        model_parallel_size_: number of GPUs used to parallelize the model.
+
+    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
+    use 2 GPUs to parallelize the model. The present function will
+    create 4 model parallel groups and 2 data parallel groups as:
+        4 model parallel groups:
+            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
+        2 data parallel groups:
+            [g0, g2, g4, g6], [g1, g3, g5, g7]
+    Note that for efficiency, the caller should make sure adjacent ranks
+    are on the same DGX box. For example if we are using 2 DGX-1 boxes
+    with a total of 16 GPUs, ranks 0 to 7 belong to the first box and
+    ranks 8 to 15 belong to the second box.
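+
+    Example (illustrative sketch; assumes ``torch.distributed`` has already
+    been initialized, e.g. via ``torch.distributed.init_process_group``)::
+
+        initialize_model_parallel(2)
+        mp_rank = get_model_parallel_rank()       # rank within the 2-GPU group
+        dp_size = get_data_parallel_world_size()  # number of model replicas
+        destroy_model_parallel()                  # tear down when done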
+ """ + if torch.distributed.get_rank() == 0: + print('> initializing model parallel with size {}'.format( + model_parallel_size_)) + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size = torch.distributed.get_world_size() + model_parallel_size = min(model_parallel_size_, world_size) + ensure_divisibility(world_size, model_parallel_size) + rank = torch.distributed.get_rank() + # Build the data parallel groups. + global _DATA_PARALLEL_GROUP + assert _DATA_PARALLEL_GROUP is None, \ + 'data parallel group is already initialized' + for i in range(model_parallel_size): + ranks = range(i, world_size, model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank % model_parallel_size): + _DATA_PARALLEL_GROUP = group + + # Build the model parallel groups. + global _MODEL_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is None, \ + 'model parallel group is already initialized' + for i in range(world_size // model_parallel_size): + ranks = range(i * model_parallel_size, + (i + 1) * model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank // model_parallel_size): + _MODEL_PARALLEL_GROUP = group + + +def model_parallel_is_initialized(): + """Check if model and data parallel groups are initialized.""" + if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: + return False + return True + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + assert _MODEL_PARALLEL_GROUP is not None, \ + 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, \ + 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_model_parallel_world_size(): + """Return world size for the model parallel group.""" + return torch.distributed.get_world_size(group=get_model_parallel_group()) + + +def get_model_parallel_rank(): + """Return my rank for the model parallel group.""" + return torch.distributed.get_rank(group=get_model_parallel_group()) + + +def get_model_parallel_src_rank(): + """Calculate the global rank corresponding to a local rank zeor + in the model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return torch.distributed.get_world_size(group=get_data_parallel_group()) + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return torch.distributed.get_rank(group=get_data_parallel_group()) + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None diff --git a/modelscope/utils/nlp/mpu/layers.py b/modelscope/utils/nlp/mpu/layers.py new file mode 100755 index 00000000..33247553 --- /dev/null +++ b/modelscope/utils/nlp/mpu/layers.py @@ -0,0 +1,420 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+
+import math
+
+import torch
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch.nn.parameter import Parameter
+
+from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
+
+from .initialize import get_model_parallel_rank
+from .initialize import get_model_parallel_world_size
+from .mappings import copy_to_model_parallel_region
+from .mappings import gather_from_model_parallel_region
+from .mappings import reduce_from_model_parallel_region
+from .mappings import scatter_to_model_parallel_region
+from .random import get_cuda_rng_tracker
+from .utils import divide
+from .utils import split_tensor_along_last_dim
+from .utils import VocabUtility
+from deepspeed.utils.timer import SynchronizedWallClockTimer
+
+from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer, MaskTaylor
+
+
+def _initialize_affine_weight(weight, output_size, input_size,
+                              per_partition_size, partition_dim, init_method,
+                              stride=1, return_master_weight=False):
+    """Initialize affine weight for model parallel.
+
+    Build the master weight on all processes and scatter
+    the relevant chunk."""
+    # If we only use 1 process for model parallelism, bypass scatter.
+    world_size = get_model_parallel_world_size()
+    if world_size == 1:
+        init_method(weight)
+        if return_master_weight:
+            return weight
+        return None
+
+    # Initialize master weight
+    master_weight = torch.empty(output_size, input_size,
+                                dtype=weight.dtype,
+                                requires_grad=False)
+    init_method(master_weight)
+
+    # Split and copy
+    per_partition_per_stride_size = divide(per_partition_size, stride)
+    weight_list = torch.split(master_weight, per_partition_per_stride_size,
+                              dim=partition_dim)
+    rank = get_model_parallel_rank()
+    my_weight_list = weight_list[rank::world_size]
+
+    with torch.no_grad():
+        torch.cat(my_weight_list, dim=partition_dim, out=weight)
+    if return_master_weight:
+        return master_weight
+    return None
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        init_method: method to initialize weights.
+    """
+    def __init__(self, num_embeddings, embedding_dim,
+                 init_method=init.xavier_normal_):
+        super(VocabParallelEmbedding, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # Set the defaults for compatibility.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        # Divide the weight matrix along the vocabulary dimension.
+        self.vocab_start_index, self.vocab_end_index = \
+            VocabUtility.vocab_range_from_global_vocab_size(
+                self.num_embeddings, get_model_parallel_rank(),
+                get_model_parallel_world_size())
+        self.num_embeddings_per_partition = self.vocab_end_index - \
+            self.vocab_start_index
+
+        # Allocate weights.
+        self.weight = Parameter(torch.Tensor(self.num_embeddings_per_partition,
+                                             self.embedding_dim))
+        self.weight.model_parallel = True
+        # And initialize.
+        _initialize_affine_weight(
+            self.weight, self.num_embeddings, self.embedding_dim,
+            self.num_embeddings_per_partition, 0, init_method)
+        self.timers = SynchronizedWallClockTimer()
+
+    def forward(self, input_):
+        # self.timers('embedding').start()
+        # Build the mask.
+        input_mask = (input_ < self.vocab_start_index) | \
+                     (input_ >= self.vocab_end_index)
+        # Mask the input.
+        masked_input = input_.clone() - self.vocab_start_index
+        masked_input[input_mask] = 0
+        # Get the embeddings.
+        output_parallel = F.embedding(masked_input, self.weight,
+                                      self.padding_idx, self.max_norm,
+                                      self.norm_type, self.scale_grad_by_freq,
+                                      self.sparse)
+        # Mask the output embedding.
+        output_parallel[input_mask, :] = 0.0
+        # Reduce across all the model parallel GPUs.
+        # self.timers('embedding').stop()
+        # self.timers('embedding reduce').start()
+        output = reduce_from_model_parallel_region(output_parallel)
+        # self.timers('embedding reduce').stop()
+        # timer_names = ['embedding', 'embedding reduce']
+        # self.timers.log(names=timer_names)
+        return output
+
+
+class ParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the embedding dimension.
+
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        init_method: method to initialize weights.
+    """
+    def __init__(self, num_embeddings, embedding_dim,
+                 init_method=init.xavier_normal_,
+                 keep_master_weight_for_test=False):
+        super(ParallelEmbedding, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # Set some defaults for compatibility.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        # Divide the weight matrix along the embedding dimension.
+        world_size = get_model_parallel_world_size()
+        self.embedding_dim_per_partition = divide(self.embedding_dim,
+                                                  world_size)
+
+        # Allocate weights.
+        self.weight = Parameter(torch.Tensor(self.num_embeddings,
+                                             self.embedding_dim_per_partition))
+        self.weight.model_parallel = True
+        # And initialize.
+        _initialize_affine_weight(
+            self.weight, self.num_embeddings, self.embedding_dim,
+            self.embedding_dim_per_partition, 1, init_method,
+            stride=1, return_master_weight=False)
+
+    def forward(self, input_):
+        input_parallel = copy_to_model_parallel_region(input_)
+        output_parallel = F.embedding(input_parallel, self.weight,
+                                      self.padding_idx, self.max_norm,
+                                      self.norm_type, self.scale_grad_by_freq,
+                                      self.sparse)
+        output = gather_from_model_parallel_region(output_parallel)
+        return output
+
+
+class ColumnParallelLinear(torch.nn.Module):
+    """Linear layer with column parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its second dimension as A = [A_1, ..., A_p].
+
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias
+        gather_output: If true, call all-gather on output and make Y available
+                       to all GPUs, otherwise, every GPU will have its output
+                       which is Y_i = XA_i
+        init_method: method to initialize weights. Note that bias is always set
+                     to zero.
+        stride: For the strided linear layers.
+        keep_master_weight_for_test: This was added for testing and should be
+                                     set to False. It returns the master weights
+                                     used for initialization.
+    """
+    def __init__(self, input_size, output_size, bias=True, gather_output=True,
+                 init_method=init.xavier_normal_, stride=1,
+                 keep_master_weight_for_test=False,
+                 pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0,
+                 LR_weight_rank=8, LR_mask_rank=8):
+        super(ColumnParallelLinear, self).__init__()
+
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.gather_output = gather_output
+        # Divide the weight matrix along the last dimension.
+        world_size = get_model_parallel_world_size()
+        self.output_size_per_partition = divide(output_size, world_size)
+
+        # Parameters.
+        # Note: torch.nn.functional.linear performs XA^T + b and as a result
+        # we allocate the transpose.
+        self.weight = Parameter(torch.Tensor(self.output_size_per_partition,
+                                             self.input_size))
+        self.weight.model_parallel = True
+        if bias:
+            self.bias = Parameter(torch.Tensor(self.output_size_per_partition))
+            self.bias.model_parallel = True
+            # Always initialize bias to zero.
+            with torch.no_grad():
+                self.bias.zero_()
+        else:
+            self.register_parameter('bias', None)
+
+        # Initialize weight.
+        self.master_weight = _initialize_affine_weight(
+            self.weight, self.output_size, self.input_size,
+            self.output_size_per_partition, 0, init_method,
+            stride=stride, return_master_weight=keep_master_weight_for_test)
+        # self.timers = SynchronizedWallClockTimer()
+
+        self.pruning_method = None
+
+    def init_mask(self):
+        if self.pruning_mask_init == "constant":
+            init.constant_(self.mask_scores, val=self.pruning_mask_scale)
+        elif self.pruning_mask_init == "uniform":
+            init.uniform_(self.mask_scores, a=-self.pruning_mask_scale, b=self.pruning_mask_scale)
+        elif self.pruning_mask_init == "kaiming":
+            init.kaiming_uniform_(self.mask_scores, a=math.sqrt(5))
+
+    def load_mask(self, pruning_threshold, k_threshold=None):
+        if self.pruning_method in ["finetune"]:
+            # mask = TopKBinarizer.apply(self.mask_scores.cpu(), pruning_threshold, k_threshold)
+            if k_threshold is not None:
+                self.mask[self.mask_scores <= k_threshold] = 0
+            else:
+                _, idx = self.mask_scores.cpu().flatten().sort(descending=True)
+                j = int(pruning_threshold * self.mask_scores.numel())
+                flat_out = self.mask.flatten()
+                flat_out[idx[j:]] = 0
+                flat_out[idx[:j]] = 1
+
+            self.__setattr__("mask_scores", None)
+            del self.mask_scores
+            self.weight.data = self.weight.data*self.mask.data
+
+            # mask = TopKBinarizer.apply(self.mask_scores, pruning_threshold, k_threshold)
+            # self.mask.data = mask.data
+            # self.weight.data = self.weight.data*self.mask.data
+            # del self.mask_scores
+            # self.__setattr__("mask_scores", None)
+            return True
+        else:
+            return False
+
+    def forward(self, input_, pruning_threshold=None):
+        # Set up backprop all-reduce.
+        input_parallel = copy_to_model_parallel_region(input_)
+
+        # Matrix multiply.
+        if hasattr(self, 'linear'):
+            output_parallel = self.linear(input_parallel, self.weight, self.bias)
+        else:
+            output_parallel = F.linear(input_parallel, self.weight, self.bias)
+
+        if self.gather_output:
+            # All-gather across the partitions.
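+            # (Each rank holds its local Y_i = X A_i; the gather concatenates
+            # the partitions along the last dimension so every rank sees Y.)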
+ output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + return output + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + """ + def __init__(self, input_size, output_size, bias=True, + input_is_parallel=False, + init_method=init.xavier_normal_, stride=1, + keep_master_weight_for_test=False, + pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0, + LR_weight_rank=8, LR_mask_rank=8): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.input_size_per_partition = divide(input_size, world_size) + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + self.weight = Parameter(torch.Tensor(self.output_size, + self.input_size_per_partition)) + self.weight.model_parallel = True + if bias: + self.bias = Parameter(torch.Tensor(self.output_size)) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + # Initialize weight. + self.master_weight = _initialize_affine_weight( + self.weight, self.output_size, self.input_size, + self.input_size_per_partition, 1, init_method, + stride=stride, return_master_weight=keep_master_weight_for_test) + + self.pruning_method = None + + def init_mask(self): + if self.pruning_mask_init == "constant": + init.constant_(self.mask_scores, val=self.pruning_mask_scale) + elif self.pruning_mask_init == "uniform": + init.uniform_(self.mask_scores, a=-self.pruning_mask_scale, b=self.pruning_mask_scale) + elif self.pruning_mask_init == "kaiming": + init.kaiming_uniform_(self.mask_scores, a=math.sqrt(5)) + + def load_mask(self, pruning_threshold, k_threshold=None): + if self.pruning_method in ["finetune"]: + if k_threshold is not None: + self.mask[self.mask_scores <= k_threshold] = 0 + else: + _, idx = self.mask_scores.cpu().flatten().sort(descending=True) + j = int(pruning_threshold * self.mask_scores.numel()) + flat_out = self.mask.flatten() + flat_out[idx[j:]] = 0 + flat_out[idx[:j]] = 1 + + self.__setattr__("mask_scores", None) + del self.mask_scores + self.weight.data = self.weight.data*self.mask.data + + # mask = TopKBinarizer.apply(self.mask_scores, pruning_threshold, k_threshold) + # self.mask.data = mask.data + # self.weight.data = self.weight.data*self.mask.data + # del self.mask_scores + # self.__setattr__("mask_scores", None) + return True + else: + return False + + def forward(self, input_, pruning_threshold=None): + # Set up backprop all-reduce. 
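+        # (With the weight split along its input dimension, each rank computes
+        # a partial product from its input shard; the all-reduce below sums
+        # these partials into the full output Y.)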
+ if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + + # Matrix multiply. + if hasattr(self, 'linear'): + output_parallel = self.linear(input_parallel, self.weight) + else: + output_parallel = F.linear(input_parallel, self.weight) + + # All-reduce across all the partitions. + output_ = reduce_from_model_parallel_region(output_parallel) + if self.bias is not None: + output = output_ + self.bias + else: + output = output_ + return output + diff --git a/modelscope/utils/nlp/mpu/mappings.py b/modelscope/utils/nlp/mpu/mappings.py new file mode 100755 index 00000000..ab7f4a69 --- /dev/null +++ b/modelscope/utils/nlp/mpu/mappings.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import get_model_parallel_group +from .utils import split_tensor_along_last_dim +from deepspeed.utils.timer import SynchronizedWallClockTimer + +def _reduce(input_): + """All-reduce the the input tensor across model parallel group.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # All-reduce. + torch.distributed.all_reduce(input_, group=group) + + return input_ + + +def _split(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Split along last dimension. + world_size = torch.distributed.get_world_size(group=group) + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. + rank = torch.distributed.get_rank(group=group) + output = input_list[rank].contiguous() + + return output + + +def _gather(input_): + """Gather tensors and concatinate along the last dimension.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Size and dimension. + last_dim = input_.dim() - 1 + rank = torch.distributed.get_rank(group=group) + world_size = torch.distributed.get_world_size(group=group) + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather(tensor_list, input_, group=group) + + # Note: torch.cat already creates a contiguous tensor. 
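+    # After all_gather, every rank holds all world_size shards, so the cat()
+    # below reconstructs the full tensor along its last dimension on each rank.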
+    output = torch.cat(tensor_list, dim=last_dim).contiguous()
+
+    return output
+
+
+class _CopyToModelParallelRegion(torch.autograd.Function):
+    """Pass the input to the model parallel region."""
+
+    @staticmethod
+    def forward(ctx, input_):
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        #timers = SynchronizedWallClockTimer()
+        #timers('backward _Copy reduce').start()
+        out = _reduce(grad_output)
+        #timers('backward _Copy reduce').stop()
+        #timers.log(names=['backward _Copy reduce'])
+        return out
+        #return _reduce(grad_output)
+
+
+class _ReduceFromModelParallelRegion(torch.autograd.Function):
+    """All-reduce the input from the model parallel region."""
+
+    @staticmethod
+    def forward(ctx, input_):
+        #timers = SynchronizedWallClockTimer()
+        #timers('forward _Reduce reduce').start()
+        out = _reduce(input_)
+        #timers('forward _Reduce reduce').stop()
+        #timers.log(names=['forward _Reduce reduce'])
+        return out
+        #return _reduce(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output
+
+
+class _ScatterToModelParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the chunk corresponding to the rank."""
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _split(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        #timers = SynchronizedWallClockTimer()
+        #timers('backward _Scatter gather').start()
+        out = _gather(grad_output)
+        #timers('backward _Scatter gather').stop()
+        #timers.log(names=['backward _Scatter gather'])
+        return out
+        #return _gather(grad_output)
+
+
+class _GatherFromModelParallelRegion(torch.autograd.Function):
+    """Gather the input from the model parallel region and concatenate."""
+
+    @staticmethod
+    def forward(ctx, input_):
+        #timers = SynchronizedWallClockTimer()
+        #timers('forward _Gather gather').start()
+        out = _gather(input_)
+        #timers('forward _Gather gather').stop()
+        #timers.log(names=['forward _Gather gather'])
+        return out
+        #return _gather(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _split(grad_output)
+
+
+# -----------------
+# Helper functions.
+# -----------------
+
+def copy_to_model_parallel_region(input_):
+    return _CopyToModelParallelRegion.apply(input_)
+
+def reduce_from_model_parallel_region(input_):
+    return _ReduceFromModelParallelRegion.apply(input_)
+
+def scatter_to_model_parallel_region(input_):
+    return _ScatterToModelParallelRegion.apply(input_)
+
+def gather_from_model_parallel_region(input_):
+    return _GatherFromModelParallelRegion.apply(input_)
diff --git a/modelscope/utils/nlp/mpu/random.py b/modelscope/utils/nlp/mpu/random.py
new file mode 100755
index 00000000..c6f5a946
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/random.py
@@ -0,0 +1,391 @@
+# coding=utf-8
+# Modified by Samyam Rajbhandari
+# Used to partition the activations stored for backward propagation,
+# thereby reducing memory consumption.
+
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+import contextlib
+import torch.distributed as dist
+import torch
+from torch import _C
+from torch.cuda import _lazy_call, device as device_ctx_manager
+#from torch.utils.checkpoint import detach_variable
+from ..utils import print_rank_0
+
+PARTITION_ACTIVATIONS = False
+PA_CORRECTNESS_TEST = False
+
+def see_memory_usage(message, force=False):
+    if not force:
+        return
+    dist.barrier()
+    if dist.get_rank() == 0:
+        print(message)
+        print("Memory Allocated ", torch.cuda.memory_allocated()/(1024*1024*1024), "GigaBytes")
+        print("Max Memory Allocated ", torch.cuda.max_memory_allocated()/(1024*1024*1024), "GigaBytes")
+        print("Cache Allocated ", torch.cuda.memory_cached()/(1024*1024*1024), "GigaBytes")
+        print("Max cache Allocated ", torch.cuda.max_memory_cached()/(1024*1024*1024), "GigaBytes")
+        print(" ")
+        #input("Press Any Key To Continue ..")
+
+
+from .initialize import get_data_parallel_rank
+from .initialize import get_model_parallel_rank
+from .initialize import get_model_parallel_world_size
+from .initialize import get_model_parallel_group
+
+mp_rank = None  # get_model_parallel_rank()
+mp_size = None  # get_model_parallel_world_size()
+mp_group = None  # get_model_parallel_group()
+
+# Default name for the model parallel rng tracker.
+_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
+transport_stream = None
+cuda_device = None
+
+def detach_variable(inputs, device=None):
+    if isinstance(inputs, tuple):
+        out = []
+        for inp in inputs:
+            if not isinstance(inp, torch.Tensor):
+                out.append(inp)
+                continue
+
+            requires_grad = inp.requires_grad
+
+            if device is not None:
+                x = inp.to(device=device)
+            else:
+                x = inp
+
+            x = x.detach()
+            x.requires_grad = requires_grad
+            out.append(x)
+        return tuple(out)
+    else:
+        raise RuntimeError(
+            "Only tuple of tensors is supported. Got unsupported input type: ", type(inputs).__name__)
+
+def _set_cuda_rng_state(new_state, device=-1):
+    """Sets the random number generator state of the current GPU.
+
+    Arguments:
+        new_state (torch.ByteTensor): The desired state
+    This function is adapted from the PyTorch repo (torch.cuda.set_rng_state)
+    with a single change: the input state is not cloned. Cloning caused
+    major performance issues for 4+ GPU cases.
+    """
+    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
+        # older PyTorch
+        def cb():
+            with device_ctx_manager(device):
+                _C._cuda_setRNGState(new_state)
+    else:
+        # newer PyTorch
+        if device == -1:
+            device = torch.device('cuda')
+        elif isinstance(device, str):
+            device = torch.device(device)
+        elif isinstance(device, int):
+            device = torch.device('cuda', device)
+
+        def cb():
+            idx = device.index
+            if idx is None:
+                idx = torch.cuda.current_device()
+            default_generator = torch.cuda.default_generators[idx]
+            default_generator.set_state(new_state)
+
+    _lazy_call(cb)
+
+
+class CudaRNGStatesTracker:
+    """Tracker for the cuda RNG states.
+
+    Using the `add` method, a cuda rng state is initialized based on
+    the input `seed` and is assigned to `name`. Later, by forking the
+    rng state, we can perform operations and return to our starting
+    cuda state.
+    """
+    def __init__(self):
+        # Map from a string name to the cuda rng state.
+        self.states_ = {}
+        # Seeds are just for bookkeeping and ensure no seed is set twice.
+        self.seeds_ = set()
+
+    def reset(self):
+        """Set to the initial state (no tracker)."""
+        self.states_ = {}
+        self.seeds_ = set()
+
+    def get_states(self):
+        """Get rng states. Copy the dictionary so we have direct
+        pointers to the states, not just a pointer to the dictionary."""
+        states = {}
+        for name in self.states_:
+            states[name] = self.states_[name]
+        return states
+
+    def set_states(self, states):
+        """Set the rng states. For efficiency purposes, we do not check
+        the size of seed for compatibility."""
+        self.states_ = states
+
+    def add(self, name, seed):
+        """Track the rng state."""
+        # Check seed is not already used.
+        if seed in self.seeds_:
+            raise Exception('seed {} already exists'.format(seed))
+        self.seeds_.add(seed)
+        # Check that state is not already defined.
+        if name in self.states_:
+            raise Exception('cuda rng state {} already exists'.format(name))
+        # Get the current rng state.
+        orig_rng_state = torch.cuda.get_rng_state()
+        # Set the new state and store it.
+        torch.cuda.manual_seed(seed)
+        self.states_[name] = torch.cuda.get_rng_state()
+        # Reset rng state to what it was.
+        _set_cuda_rng_state(orig_rng_state)
+
+    @contextlib.contextmanager
+    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
+        """Fork the cuda rng state, perform operations, and exit with
+        the original state."""
+        # Check if we have added the state.
+        if name not in self.states_:
+            raise Exception('cuda rng state {} is not added'.format(name))
+        # Store current rng state.
+        orig_cuda_rng_state = torch.cuda.get_rng_state()
+        # Set rng state to the desired one.
+        _set_cuda_rng_state(self.states_[name])
+        # Do the stuff we wanted to do.
+        try:
+            yield
+        finally:
+            # Update the current rng state for later use.
+            self.states_[name] = torch.cuda.get_rng_state()
+            # And set the state to the original state we started with.
+            _set_cuda_rng_state(orig_cuda_rng_state)
+
+
+# RNG tracker object.
+_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
+
+
+def get_cuda_rng_tracker():
+    """Get cuda rng tracker."""
+    return _CUDA_RNG_STATE_TRACKER
+
+
+def model_parallel_cuda_manual_seed(seed):
+    """Initialize model parallel cuda seed.
+
+    This function should be called after the model parallel is
+    initialized. Also, no torch.cuda.manual_seed should be called
+    after this function. Basically, this is a replacement for that
+    function.
+    Two sets of RNG states are tracked:
+        default state: This is for data parallelism and is the same among a
+                       set of model parallel GPUs but different across
+                       different model parallel groups. This is used for
+                       example for dropout in the non-model-parallel regions.
+        model-parallel state: This state is different among a set of model
+                              parallel GPUs, but the same across data parallel
+                              groups. This is used for example for dropout in
+                              model parallel regions.
+    """
+    # 2718 is just for fun and any POSITIVE value will work.
+    offset = seed + 2718
+    model_parallel_seed = offset + get_model_parallel_rank()
+    # Data parallel gets the original seed.
+    data_parallel_seed = seed
+
+    if torch.distributed.get_rank() == 0:
+        print('> initializing model parallel cuda seeds on global rank {}, '
+              'model parallel rank {}, and data parallel rank {} with '
+              'model parallel seed: {} and data parallel seed: {}'.format(
+                  torch.distributed.get_rank(), get_model_parallel_rank(),
+                  get_data_parallel_rank(), model_parallel_seed,
+                  data_parallel_seed), flush=True)
+    _CUDA_RNG_STATE_TRACKER.reset()
+    # Set the default state.
+    torch.cuda.manual_seed(data_parallel_seed)
+    # and model parallel state.
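+    # (The model-parallel state is tracked under
+    # _MODEL_PARALLEL_RNG_TRACKER_NAME so that dropout inside model-parallel
+    # regions draws different numbers on each rank of a group.)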
+ _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, + model_parallel_seed) + + +def get_partition_start(item): + global mp_rank, mp_size, mp_group + partition_size = get_partition_size(item) + start = partition_size * mp_rank + return int(start) + +def get_partition_size(item): + global mp_rank, mp_size, mp_group + size = item.numel() + partition_size = size/mp_size + return int(partition_size) + +def get_full_inputs(tensors): + inputs=[] + for i in range(int(len(tensors)/2)-1): + item = tensors[2 * i] + size = tensors[2* i + 1] + partition_size = item.numel() + tensor_size = partition_size * mp_size + flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=item.device) + partitions=[] + for i in range(mp_size): + part_i = flat_tensor.narrow(0, partition_size * i , partition_size) + if i == mp_rank: + part_i.copy_(item) + partitions.append(part_i) + dist.all_gather(partitions,partitions[mp_rank], group=mp_group) + input_tensor = flat_tensor.view(list(size.numpy())) + item.data=input_tensor.data + + inputs.append(item) + inputs.append(tensors[-2]) + + return tuple(inputs) + + + +class CheckpointFunction(torch.autograd.Function): + """This function is adapted from torch.utils.checkpoint with + two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + """ + @staticmethod + def forward(ctx, run_function, *args): + ctx.run_function = run_function + global mp_rank, mp_size, mp_group + if mp_rank is None: + mp_rank = get_model_parallel_rank() + mp_size = get_model_parallel_world_size() + mp_group = get_model_parallel_group() + + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + if cuda_device is None: + if dist.get_rank() == 0: + print(f"Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}") + + cuda_device = torch.cuda.current_device() + #The transport stream is used to overlap the allgather communication for the activations + #with the computation in the backward pass + transport_stream = torch.cuda.Stream(device=cuda_device) + + if PARTITION_ACTIVATIONS: + #inputs = [item.detach().contiguous().view(-1).narrow(0, get_partition_start(item), get_partition_size(item)).clone() for item in args[:-1]] + #inputs.append(args[-1]) + print_rank_0("args: ", args) + inputs = [item.detach().contiguous().view(-1).narrow(0, get_partition_start(item), get_partition_size(item)).clone() for item in args[:2]] + inputs.extend(args[2:]) + + + #just in case something funky is happening such as reuse of inputs + inputs_cuda = [item.to(cuda_device) for item in args] + + # Copy the rng states. 
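+        # (Saving the CPU and CUDA RNG states here lets backward() replay the
+        # forward pass with identical dropout masks before recomputing
+        # activations; they are restored around the re-run in backward().)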
+ ctx.fwd_cpu_rng_state = torch.get_rng_state() + ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + #ctx.save_for_backward(*args) + with torch.no_grad(): + outputs = run_function(*inputs_cuda) + + del inputs_cuda + + if PARTITION_ACTIVATIONS: + new_args = [] + for arg, inp in zip(args,inputs): + size= torch.tensor(arg.size()) + arg.data = inp.data + new_args.append(arg) + new_args.append(size) + ctx.save_for_backward(*new_args) + else: + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError("Checkpointing is not compatible with .grad(), " + "please use .backward() if possible") + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + + if PARTITION_ACTIVATIONS: + with torch.cuda.stream(transport_stream): + inputs = get_full_inputs(ctx.saved_tensors) + detached_inputs = detach_variable(inputs) + else: + inputs = ctx.saved_tensors + detached_inputs = detach_variable(inputs) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. + torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + if PARTITION_ACTIVATIONS: + current_stream=torch.cuda.current_stream() + current_stream.wait_stream(transport_stream) + + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. + torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, torch.Tensor): + outputs = (outputs,) + torch.autograd.backward(outputs, args) + return (None,) + tuple(inp.grad for inp in detached_inputs) + + +def checkpoint(function, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, *args) + +def partition_activations_in_checkpoint(partition_activation): + global PARTITION_ACTIVATIONS + PARTITION_ACTIVATIONS=partition_activation + if dist.get_rank() == 0: + print(f"**************Partition Activations {PARTITION_ACTIVATIONS}************") + + diff --git a/modelscope/utils/nlp/mpu/tests/__init__.py b/modelscope/utils/nlp/mpu/tests/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/modelscope/utils/nlp/mpu/tests/commons.py b/modelscope/utils/nlp/mpu/tests/commons.py new file mode 100755 index 00000000..be986e58 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/commons.py @@ -0,0 +1,82 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
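+
+# Shared helpers for the mpu tests: IdentityLayer wraps a learnable weight so
+# gradients can be checked end-to-end, and initialize_distributed() bootstraps
+# torch.distributed from the RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT
+# environment variables.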
+ +import argparse +import os +import random +import numpy +import torch + +import mpu + + +class IdentityLayer(torch.nn.Module): + def __init__(self, size, scale=1.0): + super(IdentityLayer, self).__init__() + self.weight = torch.nn.Parameter(scale * torch.randn(size)) + def forward(self): + return self.weight + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def initialize_distributed(backend='nccl'): + """Initialize torch.distributed.""" + # Get local rank in case it is provided. + parser = argparse.ArgumentParser() + parser.add_argument('--local_rank', type=int, default=None, + help='local rank passed from distributed launcher') + args = parser.parse_args() + local_rank = args.local_rank + + # Get rank and world size. + rank = int(os.getenv('RANK', '0')) + world_size = int(os.getenv("WORLD_SIZE", '1')) + + print('> initializing torch.distributed with local rank: {}, ' + 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) + + # Set the device id. + device = rank % torch.cuda.device_count() + if local_rank is not None: + device = local_rank + #torch.cuda.set_device(device) + + # Call the init process. + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend=backend, + world_size=world_size, + rank=rank, + init_method=init_method) + + +def print_separator(message): + torch.distributed.barrier() + filler_len = (78 - len(message)) // 2 + filler = '-' * filler_len + string = '\n' + filler + ' {} '.format(message) + filler + if torch.distributed.get_rank() == 0: + print(string, flush=True) + torch.distributed.barrier() diff --git a/modelscope/utils/nlp/mpu/tests/test_cross_entropy.py b/modelscope/utils/nlp/mpu/tests/test_cross_entropy.py new file mode 100755 index 00000000..20875971 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/test_cross_entropy.py @@ -0,0 +1,110 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
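+
+# This test builds identical logits on every rank, computes the reference loss
+# with torch.nn.functional.cross_entropy and the parallel loss with
+# vocab_parallel_cross_entropy on scattered logits, and asserts that losses
+# and input gradients agree to within 1e-6.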
+ +import random +import sys +sys.path.append("../..") + +import torch +import torch.nn.functional as F +import mpu +from mpu.cross_entropy import vocab_parallel_cross_entropy + +from commons import initialize_distributed +from commons import print_separator +from commons import IdentityLayer +from commons import set_random_seed + + +def torch_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + target = torch.cuda.LongTensor( + size=(batch_size, seq_length)).random_(0, vocab_size) + loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), + target.view(-1), + reduction='none').view_as(target).mean() + loss.backward() + return loss, identity.weight.grad + + +def mpu_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + logits_parallel = mpu.scatter_to_model_parallel_region(logits) + target = torch.cuda.LongTensor( + size=(batch_size, seq_length)).random_(0, vocab_size) + loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() + loss.backward() + return loss, identity.weight.grad + + +def test_cross_entropy(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cross entropy with model parallel size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 13 + seq_length = 17 + vocab_size_per_partition = 11 + logits_scale = 1000.0 + vocab_size = vocab_size_per_partition * model_parallel_size + seed = 1234 + + loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + + error = loss_torch.sub_(loss_mpu).abs().max() + print(' max error in loss on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = grad_torch.sub_(grad_mpu).abs().max() + print(' max error in grad on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test cross entropy') + test_cross_entropy(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/utils/nlp/mpu/tests/test_data.py b/modelscope/utils/nlp/mpu/tests/test_data.py new file mode 100755 index 00000000..6e8eca73 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/test_data.py @@ -0,0 +1,92 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import operator
+import sys
+sys.path.append("../..")
+
+import torch
+import mpu
+from mpu import data as data_utils
+
+from commons import initialize_distributed
+from commons import print_separator
+
+
+def test_broadcast_data(model_parallel_size):
+
+    if torch.distributed.get_rank() == 0:
+        print('> testing broadcast_data with model parallel size {} ...'.
+              format(model_parallel_size))
+
+    mpu.initialize_model_parallel(model_parallel_size)
+    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
+    model_parallel_size = mpu.get_model_parallel_world_size()
+
+    key_size_t = {'key1': [7, 11],
+                  'key2': [8, 2, 1],
+                  'key3': [13],
+                  'key4': [5, 1, 2],
+                  'key5': [5, 12]}
+    keys = list(key_size_t.keys())
+
+    data = {}
+    data_t = {}
+    for key in key_size_t:
+        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
+        data_t[key] = data[key].clone()
+    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
+    data_t['keyX'] = data['keyX'].clone()
+    if mpu.get_model_parallel_rank() != 0:
+        data = None
+
+    data_utils._check_data_types(keys, data_t, torch.int64)
+    key_size, key_numel, \
+        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
+    for key in keys:
+        assert key_size[key] == key_size_t[key]
+    total_numel_t = 0
+    for key in keys:
+        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
+        assert key_numel[key] == target_size
+        total_numel_t += target_size
+    assert total_numel == total_numel_t
+
+    data_b = data_utils.broadcast_data(keys, data, torch.int64)
+    for key in keys:
+        tensor = data_t[key].cuda()
+        assert data_b[key].sub(tensor).abs().max() == 0
+
+    # Reset groups
+    mpu.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print('>> passed the test :-)')
+
+
+if __name__ == '__main__':
+
+    initialize_distributed()
+    world_size = torch.distributed.get_world_size()
+
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test broadcast data')
+        test_broadcast_data(model_parallel_size)
+        model_parallel_size *= 2
+
+
diff --git a/modelscope/utils/nlp/mpu/tests/test_initialize.py b/modelscope/utils/nlp/mpu/tests/test_initialize.py
new file mode 100755
index 00000000..c77e2e6a
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/tests/test_initialize.py
@@ -0,0 +1,98 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
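+
+# Verifies group construction: each rank's model-parallel world size and rank,
+# the induced data-parallel groups, and get_model_parallel_src_rank(), which
+# is used as the broadcast source within a model-parallel group.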
+ +import sys +sys.path.append("../..") + +import torch +import mpu + +from commons import initialize_distributed +from commons import print_separator + + +def test_initialize_model_parallel(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing initialize_model_parallel with size {} ...'.format( + model_parallel_size)) + model_parallel_size_ = min(model_parallel_size, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size_) + assert mpu.model_parallel_is_initialized() + + # Checks. + def check(group, world_size, rank): + assert world_size == torch.distributed.get_world_size(group=group) + assert rank == torch.distributed.get_rank(group=group) + + # Model parallel. + world_size = model_parallel_size_ + rank = torch.distributed.get_rank() % model_parallel_size_ + assert world_size == mpu.get_model_parallel_world_size() + assert rank == mpu.get_model_parallel_rank() + check(mpu.get_model_parallel_group(), world_size, rank) + + + # Data parallel. + world_size = torch.distributed.get_world_size() // model_parallel_size_ + rank = torch.distributed.get_rank() // model_parallel_size + assert world_size == mpu.get_data_parallel_world_size() + assert rank == mpu.get_data_parallel_rank() + check(mpu.get_data_parallel_group(), world_size, rank) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_get_model_parallel_src_rank(model_parallel_size_): + + if torch.distributed.get_rank() == 0: + print('> testing get_model_parallel_src_rank with size {} ...'.format( + model_parallel_size_)) + model_parallel_size = min(model_parallel_size_, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size) + assert mpu.model_parallel_is_initialized() + + # Checks + src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() + assert mpu.get_model_parallel_src_rank() == src_rank + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test initialize model parallel') + test_initialize_model_parallel(model_parallel_size) + print_separator('test model parallel source rank') + test_get_model_parallel_src_rank(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/utils/nlp/mpu/tests/test_layers.py b/modelscope/utils/nlp/mpu/tests/test_layers.py new file mode 100755 index 00000000..c38bf725 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/test_layers.py @@ -0,0 +1,529 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
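+
+# Exercises the parallel layers end-to-end against their serial torch
+# counterparts: embeddings, column-/row-parallel linear layers, self-attention
+# and a full transformer layer, comparing losses and gradients across ranks.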
+ +import random +import sys +sys.path.append("../..") + +import torch +import torch.nn.init as init +from torch.nn.parameter import Parameter +import mpu + +from commons import initialize_distributed +from commons import print_separator +from commons import set_random_seed +from mpu import layers + + +def test_parallel_embedding(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing parallel embedding with model parallel size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 17 + seq_length = 23 + vocab_size = 48 + hidden_size = 16 + seed = 1236 + + set_random_seed(123) + input_data = torch.LongTensor( + size=(batch_size,seq_length)).random_(0, vocab_size).cuda() + loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() + + set_random_seed(seed) + embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() + + output = embedding_original(input_data) + loss_original = torch.mul(output, loss_weight).sum() + loss_original.backward() + + set_random_seed(seed) + embedding_parallel = layers.ParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_parallel(input_data) + loss_parallel = torch.mul(output, loss_weight).sum() + loss_parallel.backward() + + set_random_seed(seed) + embedding_vocab_parallel = layers.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_vocab_parallel(input_data) + loss_vocab_parallel = torch.mul(output, loss_weight).sum() + loss_vocab_parallel.backward() + + torch.distributed.barrier() + error = loss_parallel.sub(loss_original).abs() + print(' error in loss (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + torch.distributed.barrier() + error = loss_vocab_parallel.sub(loss_original).abs() + print(' error in loss (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + hidden_size // model_parallel_size, + 1)[mpu.get_model_parallel_rank()] + error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max() + print(' error in grad (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + vocab_size // model_parallel_size, + 0)[mpu.get_model_parallel_rank()] + error = embedding_vocab_parallel.weight.grad.sub( + weight_grad_orig).abs().max() + print(' error in grad (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_initialize_affine_weight(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing initialize_affine_weight with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = 
output_size_coeff * model_parallel_size + + # --------------- + # Column parallel + # --------------- + weight = torch.empty(output_size_coeff, input_size) + set_random_seed(seed) + layers._initialize_affine_weight(weight, output_size, input_size, + + output_size_coeff, 0, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split(master_weight, output_size_coeff, + dim=0)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' column parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # ------------ + # Row parallel + # ------------ + weight = torch.empty(output_size, input_size_coeff) + set_random_seed(seed) + mpu.layers._initialize_affine_weight(weight, output_size, input_size, + input_size_coeff, 1, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split(master_weight, input_size_coeff, + dim=1)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' row parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer2D(torch.nn.Module): + def __init__(self, m , n): + super(IdentityLayer2D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n)) + torch.nn.init.xavier_normal_(self.weight) + def forward(self): + return self.weight + + +def test_column_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing ColumnParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.ColumnParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
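+    # With output Y = X A^T + b and scalar loss L = sum(Y * W), the analytic
+    # gradients are dL/dA = W^T X, dL/db = the column sums of W, and
+    # dL/dX = W A; each rank should match the dim=0 shard of dL/dA that
+    # corresponds to its local weight partition.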
+ dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_model_parallel_rank() + my_dLdA = torch.split(dLdA, output_size_coeff, + dim=0)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + my_dLdb = torch.split(dLdb, output_size_coeff, + dim=0)[rank].contiguous().clone() + error = my_dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def test_row_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing RowParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.RowParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
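+    # Same analytic gradients as in the column-parallel test, but the weight
+    # is now split along its input dimension, so the local dL/dA shard comes
+    # from a dim=1 split and dL/db is replicated (the bias is not partitioned).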
+    dLdY = loss_weight
+    X = identity_layer.weight
+    A = linear_layer.master_weight.cuda()
+    dLdA = torch.matmul(dLdY.t(), X)
+    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
+    dLdX = torch.matmul(dLdY, A)
+
+    rank = mpu.get_model_parallel_rank()
+    my_dLdA = torch.split(dLdA, input_size_coeff,
+                          dim=1)[rank].contiguous().clone()
+    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
+    torch.distributed.barrier()
+    print(' error in dLdA on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+
+    error = dLdb.sub(linear_layer.bias.grad).abs().max()
+    torch.distributed.barrier()
+    print(' error in dLdb on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+
+    error = dLdX.sub(identity_layer.weight.grad).abs().max()
+    torch.distributed.barrier()
+    print(' error in dLdX on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+
+    # Reset groups
+    mpu.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print(' >> passed the test :-)')
+
+
+class IdentityLayer3D(torch.nn.Module):
+    def __init__(self, m, n, k):
+        super(IdentityLayer3D, self).__init__()
+        self.weight = Parameter(torch.Tensor(m, n, k))
+        torch.nn.init.xavier_normal_(self.weight)
+    def forward(self):
+        return self.weight
+
+
+def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
+                            hidden_size_per_att_head, dropout_prob, batch_size,
+                            sequence_length):
+    mpu.initialize_model_parallel(model_parallel_size)
+    model_parallel_size = mpu.get_model_parallel_world_size()
+
+    seed = 12345
+    set_random_seed(seed)
+
+    num_att_heads = num_att_heads_per_partition * \
+        torch.distributed.get_world_size()
+    hidden_size = hidden_size_per_att_head * num_att_heads
+
+    # Network
+    identity_layer = IdentityLayer3D(batch_size, sequence_length,
+                                     hidden_size).cuda()
+    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
+                                                    dropout_prob).cuda()
+    loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
+    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
+    # Forward
+    input_ = identity_layer()
+    output = attention_layer(input_, attention_mask)
+    loss = torch.mul(output, loss_weight).sum()
+    # Backward
+    loss.backward()
+
+    rank = mpu.get_model_parallel_rank()
+    mpu.destroy_model_parallel()
+    return rank, hidden_size, model_parallel_size, loss, \
+        attention_layer, identity_layer
+
+
+def test_parallel_self_attention(model_parallel_size):
+
+    if torch.distributed.get_rank() == 0:
+        print('> testing ParallelSelfAttention with model parallel '
+              'size: {}'.format(model_parallel_size))
+
+    num_att_heads_per_partition = 3
+    hidden_size_per_att_head = 7
+    dropout_prob = 0.0  # has to be zero
+    batch_size = 5
+    sequence_length = 13
+
+    rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
+        attention_layer_1, identity_layer_1 = parallel_self_attention(
+            1, num_att_heads_per_partition,
+            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
+
+    rank, hidden_size, model_parallel_size, loss, \
+        attention_layer, identity_layer = parallel_self_attention(
+            model_parallel_size, num_att_heads_per_partition,
+            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
+    assert hidden_size_1 == hidden_size
+
+    error = loss_1.sub(loss).abs().max()
+    torch.distributed.barrier()
+    print(' loss error on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
assert error < 5.0e-6 + + my_lin_grad_list = torch.split( + attention_layer_1.query_key_value.weight.grad, + hidden_size // model_parallel_size, 0)[rank::model_parallel_size] + my_lin_grad = torch.cat(my_lin_grad_list, dim=0) + error = my_lin_grad.sub( + attention_layer.query_key_value.weight.grad).abs().max() + torch.distributed.barrier() + print(' weight gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + +def parallel_transformer(model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length): + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * \ + torch.distributed.get_world_size() + hidden_size = hidden_size_per_att_head * num_att_heads + intermediate_size = 4 * hidden_size + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + transformer_layer = mpu.BertParallelTransformerLayer( + hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, + torch.nn.functional.relu, 1.0e-5).cuda() + + loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = transformer_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer + + +def test_parallel_transformer_layer(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelTransformerLayer with model parallel ' + 'size: {}'.format(model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + batch_size = 5 + sequence_length = 13 + + rank_1, hidden_size_1, model_parallel_size_1, loss_1, \ + transformer_layer_1, identity_layer_1 = parallel_transformer( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer = parallel_transformer( + model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +if __name__ == '__main__': + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + initialize_distributed() + world_size 
= torch.distributed.get_world_size() + + print_separator('test initialize affine weight') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_initialize_affine_weight(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test parallel embedding') + test_parallel_embedding(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test column-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_column_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test row-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_row_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel self-attention') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_self_attention(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel transformer') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_transformer_layer(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/utils/nlp/mpu/tests/test_random.py b/modelscope/utils/nlp/mpu/tests/test_random.py new file mode 100755 index 00000000..e3792089 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/test_random.py @@ -0,0 +1,207 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append("../..") + +import torch +import mpu + +from commons import initialize_distributed +from commons import print_separator + + +def test_set_cuda_rng_state(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing set_rng_state with size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + size = 123 + seed = 1234 + torch.cuda.manual_seed(1234) + tensor = torch.cuda.FloatTensor(size) + + # Get the state + rng_state = torch.cuda.get_rng_state() + rng_state_copy = rng_state.clone() + + # Do some stuff. + for _ in range(5): + torch.randn(size, out=tensor) + result_1 = tensor.clone() + + assert rng_state.sub(rng_state_copy).max() == 0 + assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 + + # State should be different. + new_rng_state = torch.cuda.get_rng_state() + max_diff = new_rng_state.sub(rng_state).max() + print(' max diff in rng state (should be non-zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), max_diff)) + assert max_diff > 0 + + # Reset the rng state and do the same stuff. 
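+    # (Restoring the saved state should make the generator replay exactly the
+    # same sequence of draws, so result_2 below must match result_1.)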
+ mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + result_2 = tensor.clone() + + # Results should be the same + error = result_2.sub(result_1).abs().max() + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Input state should have remained intact. + error = rng_state.sub(rng_state_copy).max() + print(' max error in rng state (should be zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), error)) + assert error == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_cuda_rng_tracker(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cuda rng tracker with size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed_1 = 1234 + seed_2 = 4321 + size = [12, 21] + tensor = torch.cuda.FloatTensor(size) + + # Set to seed_1 and generate two tensors. + torch.cuda.manual_seed(seed_1) + torch.randn(size, out=tensor) + target_11 = tensor.clone() + torch.randn(size, out=tensor) + target_12 = tensor.clone() + + # Set to seed_2 and generate two tensors. + torch.cuda.manual_seed(seed_2) + torch.randn(size, out=tensor) + target_21 = tensor.clone() + torch.randn(size, out=tensor) + target_22 = tensor.clone() + + # Now if we interleave seed_1 and seed_2, + # we should still get the same tensors + torch.cuda.manual_seed(seed_1) + mpu.get_cuda_rng_tracker().add('test', seed_2) + + torch.randn(size, out=tensor) + result_11 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_21 = tensor.clone() + + torch.randn(size, out=tensor) + result_12 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_22 = tensor.clone() + + diff = result_11.sub(result_21).abs().max() + diff = min(diff, result_12.sub(result_22).abs().max()) + print(' max diff in generated tensors (should be non-zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) + assert diff > 1.0e-6 + error = max(result_11.sub(target_11).abs().max(), + result_12.sub(target_12).abs().max()) + error = max(error, result_21.sub(target_21).abs().max()) + error = max(error, result_22.sub(target_22).abs().max()) + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_model_parallel_cuda_manual_seed(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing model parallel cuda manual seed with size {} ...'. 
+              format(model_parallel_size))
+
+    mpu.initialize_model_parallel(model_parallel_size)
+    model_parallel_size = mpu.get_model_parallel_world_size()
+
+    mpu.model_parallel_cuda_manual_seed(12345)
+    assert torch.cuda.initial_seed() == 12345
+    with mpu.get_cuda_rng_tracker().fork():
+        assert torch.cuda.initial_seed() == (12345 + 2718
+                                             + mpu.get_model_parallel_rank())
+
+    # Reset the tracker
+    mpu.get_cuda_rng_tracker().reset()
+
+    # Reset groups
+    mpu.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print('>> passed the test :-)')
+
+
+if __name__ == '__main__':
+
+    initialize_distributed()
+    world_size = torch.distributed.get_world_size()
+
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test set rng state')
+        test_set_cuda_rng_state(model_parallel_size)
+        model_parallel_size *= 2
+
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test cuda rng tracker')
+        test_cuda_rng_tracker(model_parallel_size)
+        model_parallel_size *= 2
+
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test model parallel cuda manual seed')
+        test_model_parallel_cuda_manual_seed(model_parallel_size)
+        model_parallel_size *= 2
diff --git a/modelscope/utils/nlp/mpu/transformer.py b/modelscope/utils/nlp/mpu/transformer.py
new file mode 100755
index 00000000..aa16ff06
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/transformer.py
@@ -0,0 +1,1065 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Transformer."""
+
+import math
+
+import torch
+import torch.nn.init as init
+from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
+
+from .initialize import get_model_parallel_world_size
+from .layers import ColumnParallelLinear
+from .layers import RowParallelLinear
+from .mappings import gather_from_model_parallel_region
+
+import deepspeed
+
+from .random import checkpoint
+from .random import get_cuda_rng_tracker
+
+from .utils import divide
+from .utils import split_tensor_along_last_dim
+from ..utils import print_rank_0
+from deepspeed.utils.timer import SynchronizedWallClockTimer
+
+
+class GPT2ParallelSelfAttention(torch.nn.Module):
+    """Parallel self-attention layer for GPT2.
+
+    The self-attention layer takes input with size [b, s, h] where b is
+    the batch size, s is the sequence length, and h is the hidden size,
+    and creates output of the same size.
+    Arguments:
+        hidden_size: total hidden size of the layer (h).
+        num_attention_heads: number of attention heads (n). Note that we
+                             require n to be divisible by the number of GPUs
+                             used to parallelize the model. Also, we
+                             require hidden size to be divisible by n.
+        dropout_prob: dropout probability for the attention scores.
+        init_method: weight initialization.
+        output_layer_init_method: output layer initialization. If None, use
+                                  `init_method`.
+ We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + def __init__(self, hidden_size, num_attention_heads, + attention_dropout_prob, output_dropout_prob, + init_method, output_layer_init_method=None, + pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0, + LR_weight_rank=8, LR_mask_rank=8): + super(GPT2ParallelSelfAttention, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide(num_attention_heads, + world_size) + # Strided linear layer. + self.query_key_value = ColumnParallelLinear(hidden_size, 3*hidden_size, + stride=3, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear(hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + pruning_method=pruning_method, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, ltor_mask, is_infer=False, pruning_threshold=None,): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. [b, s, hp] + tgt_len = hidden_states.size(1) + mixed_x_layer = self.query_key_value(hidden_states, pruning_threshold=pruning_threshold,) + (mixed_query_layer, + mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + + previous_type = value_layer.type() + + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + # Apply the left to right attention mask. 
+        if is_infer:
+            src_len = key_layer.size(2)
+            ltor_mask = torch.tril(torch.ones(
+                (1, tgt_len, src_len),
+                device=hidden_states.device)).view(1, 1, tgt_len,
+                                                   src_len).type(previous_type)
+        attention_scores = torch.mul(attention_scores, ltor_mask) - \
+            10000.0 * (1.0 - ltor_mask)
+
+        # Attention probabilities. [b, np, s, s]
+        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        with get_cuda_rng_tracker().fork():
+            attention_probs = self.attention_dropout(attention_probs)
+
+        # Context layer.
+        # [b, np, s, hn]
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # [b, s, np, hn]
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        # [b, s, hp]
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        # Output. [b, s, h]
+        output = self.dense(context_layer, pruning_threshold=pruning_threshold)
+        output = self.output_dropout(output)
+
+        return output
+
+
+@torch.jit.script
+def gelu_impl(x):
+    """OpenAI's gelu implementation."""
+    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
+                                       (1.0 + 0.044715 * x * x)))
+
+
+def gelu(x):
+    return gelu_impl(x)
+
+
+class GPT2ParallelMLP(torch.nn.Module):
+    """MLP for GPT2.
+
+    The MLP takes the input with h hidden state, projects it to 4*h
+    hidden dimension, performs a gelu transformation, and projects the
+    state back into h hidden dimension. At the end, dropout is also
+    applied.
+
+    Arguments:
+        hidden_size: The hidden size of the self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        init_method: initialization method used for the weights. Note
+                     that all biases are initialized to zero and
+                     layernorm weights are initialized to one.
+        output_layer_init_method: output layer initialization. If None,
+                                  use `init_method`.
+    """
+
+    def __init__(self, hidden_size, output_dropout_prob, init_method,
+                 output_layer_init_method=None,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(GPT2ParallelMLP, self).__init__()
+        # Set output layer initialization if not provided.
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+        # Project to 4h.
+        self.dense_h_to_4h = ColumnParallelLinear(hidden_size, 4 * hidden_size,
+                                                  gather_output=False,
+                                                  init_method=init_method,
+                                                  pruning_method=pruning_method,
+                                                  pruning_mask_init=pruning_mask_init,
+                                                  pruning_mask_scale=pruning_mask_scale)
+        # Project back to h.
+        self.dense_4h_to_h = RowParallelLinear(
+            4 * hidden_size,
+            hidden_size,
+            input_is_parallel=True,
+            init_method=output_layer_init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+        self.dropout = torch.nn.Dropout(output_dropout_prob)
+
+    def forward(self, hidden_states, pruning_threshold=None):
+        # [b, s, 4hp]
+        intermediate_parallel = self.dense_h_to_4h(
+            hidden_states, pruning_threshold=pruning_threshold)
+        intermediate_parallel = gelu(intermediate_parallel)
+
+        # [b, s, h]
+        output = self.dense_4h_to_h(intermediate_parallel,
+                                    pruning_threshold=pruning_threshold)
+        output = self.dropout(output)
+        return output
+
+
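A quick, self-contained sanity check (not part of the patch) that the tanh formula in gelu_impl above matches PyTorch's built-in tanh GELU; the `approximate='tanh'` argument assumes PyTorch >= 1.12, and 0.7978845608028654 is sqrt(2/pi) from the Hendrycks & Gimpel approximation.

import torch

x = torch.randn(8, 16)
# Same formula as gelu_impl above, written out inline.
ours = 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
ref = torch.nn.functional.gelu(x, approximate='tanh')
assert torch.allclose(ours, ref, atol=1e-6)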
+class GPT2ParallelTransformerLayer(torch.nn.Module):
+    """A single layer transformer for GPT2.
+
+    We use the following notation:
+        h: hidden size
+        n: number of attention heads
+        b: batch size
+        s: sequence length
+    The transformer layer takes input with size [b, s, h] and returns an
+    output of the same size.
+
+    Arguments:
+        hidden_size: The hidden size of the self attention.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                scores in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method: initialization method used for the weights. Note
+                     that all biases are initialized to zero and
+                     layernorm weights are initialized to one.
+        output_layer_init_method: output layers (attention output and
+                                  mlp output) initialization. If None,
+                                  use `init_method`.
+    """
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 layernorm_epsilon,
+                 init_method,
+                 output_layer_init_method=None,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(GPT2ParallelTransformerLayer, self).__init__()
+        # Set output layer initialization if not provided.
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+
+        # Layernorm on the input data.
+        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+
+        # Self attention.
+        self.attention = GPT2ParallelSelfAttention(
+            hidden_size,
+            num_attention_heads,
+            attention_dropout_prob,
+            output_dropout_prob,
+            init_method,
+            output_layer_init_method=output_layer_init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+
+        # Layernorm on the attention output.
+        self.post_attention_layernorm = LayerNorm(hidden_size,
+                                                  eps=layernorm_epsilon)
+
+        # MLP
+        self.mlp = GPT2ParallelMLP(
+            hidden_size,
+            output_dropout_prob,
+            init_method,
+            output_layer_init_method=output_layer_init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+
+    def forward(self, hidden_states, ltor_mask, pruning_threshold=None):
+        # hidden_states: [b, s, h]
+        # ltor_mask: [1, 1, s, s]
+
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output = self.attention(layernorm_output, ltor_mask,
+                                          pruning_threshold=pruning_threshold)
+        # Residual connection.
+        layernorm_input = hidden_states + attention_output
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+        # MLP.
+        mlp_output = self.mlp(layernorm_output,
+                              pruning_threshold=pruning_threshold)
+        # Second residual connection.
+        output = layernorm_input + mlp_output
+
+        return output
+
+
+def unscaled_init_method(sigma):
+    """Init method based on N(0, sigma)."""
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
+
+    return init_
+
+
+def scaled_init_method(sigma, num_layers):
+    """Init method based on N(0, sigma/sqrt(2*num_layers))."""
+    std = sigma / math.sqrt(2.0 * num_layers)
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+
+    return init_
+
+
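A small numeric check (not part of the patch) of why scaled_init_method divides the std by sqrt(2*num_layers): the residual stream accumulates 2N branch outputs per forward (attention output and MLP output for each of N layers), and a sum of 2N independent contributions with std sigma/sqrt(2N) has std of roughly sigma, so the activation scale stays stable with depth.

import math
import torch

sigma, num_layers = 0.02, 24
# 2N independent "residual branch" samples, each scaled like scaled_init_method.
branches = torch.randn(2 * num_layers, 100000) * (sigma / math.sqrt(2.0 * num_layers))
print(branches.sum(dim=0).std().item())  # ~= sigma = 0.02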
+class GPT2ParallelTransformer(torch.nn.Module):
+    """GPT-2 transformer.
+
+    This module takes input from the embedding layer and its output can
+    be used directly by a logit layer. It consists of L (num-layers)
+    blocks of:
+        layer norm
+        self attention
+        residual connection
+        layer norm
+        mlp
+        residual connection
+    followed by a final layer norm.
+
+    Arguments:
+        num_layers: Number of transformer layers.
+        hidden_size: The hidden size of the self attention.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                scores in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        checkpoint_activations: if True, checkpoint activations.
+        checkpoint_num_layers: number of layers to checkpoint. This
+                               is basically the chunk size in checkpointing.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method_std: standard deviation of the init method which has
+                         the form N(0, std).
+        use_scaled_init_for_output_weights: If True, use 1/sqrt(2*num_layers)
+                                            scaling for the output weights
+                                            (output of self attention and mlp).
+    """
+    def __init__(self,
+                 num_layers,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 checkpoint_activations,
+                 checkpoint_num_layers=1,
+                 layernorm_epsilon=1.0e-5,
+                 init_method_std=0.02,
+                 use_scaled_init_for_output_weights=True,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(GPT2ParallelTransformer, self).__init__()
+        # Store activation checkpointing flag.
+        self.checkpoint_activations = checkpoint_activations
+        self.checkpoint_num_layers = checkpoint_num_layers
+
+        output_layer_init_method = None
+        if use_scaled_init_for_output_weights:
+            output_layer_init_method = scaled_init_method(init_method_std,
+                                                          num_layers)
+
+        def get_layer():
+            return GPT2ParallelTransformerLayer(
+                hidden_size,
+                num_attention_heads,
+                attention_dropout_prob,
+                output_dropout_prob,
+                layernorm_epsilon,
+                unscaled_init_method(init_method_std),
+                output_layer_init_method=output_layer_init_method,
+                pruning_method=pruning_method,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale)
+
+        # Transformer layers.
+        self.layers = torch.nn.ModuleList(
+            [get_layer() for _ in range(num_layers)])
+
+        # Final layer norm before output.
+        self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+
+        if deepspeed.checkpointing.is_configured():
+            global get_cuda_rng_tracker, checkpoint
+            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
+            checkpoint = deepspeed.checkpointing.checkpoint
+
+    def forward(self, hidden_states, attention_mask, pruning_threshold=None):
+
+        def custom(start, end):
+            def custom_forward(*inputs):
+                layers_ = self.layers[start:end]
+                x_ = inputs[0]
+                for layer in layers_:
+                    x_ = layer(x_, inputs[1])
+                return x_
+            return custom_forward
+
+        if self.checkpoint_activations:
+            layer_idx = 0
+            num_layers = len(self.layers)
+            chunk_length = self.checkpoint_num_layers
+            while layer_idx < num_layers:
+                hidden_states = checkpoint(
+                    custom(layer_idx, layer_idx + chunk_length),
+                    hidden_states, attention_mask)
+                layer_idx += chunk_length
+        else:
+            for layer in self.layers:
+                hidden_states = layer(hidden_states, attention_mask,
+                                      pruning_threshold=pruning_threshold)
+
+        # Final layer norm.
+        output = self.final_layernorm(hidden_states)
+
+        return output
+
+
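A self-contained sketch (not part of the patch) of the chunked activation-checkpointing pattern in GPT2ParallelTransformer.forward above, with torch.utils.checkpoint.checkpoint standing in for the mpu/deepspeed implementation: only the chunk boundaries keep activations, and the chunk interiors are recomputed during backward.

import torch
from torch.utils.checkpoint import checkpoint

layers = torch.nn.ModuleList([torch.nn.Linear(16, 16) for _ in range(8)])

def run_chunk(start, end):
    # Closure over a slice of layers, mirroring custom(start, end) above.
    def forward(x):
        for layer in layers[start:end]:
            x = torch.relu(layer(x))
        return x
    return forward

x = torch.randn(4, 16, requires_grad=True)
chunk = 2  # plays the role of checkpoint_num_layers
for start in range(0, len(layers), chunk):
    x = checkpoint(run_chunk(start, start + chunk), x)
x.sum().backward()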
+class BertParallelSelfAttention(torch.nn.Module):
+    """Parallel self-attention layer for BERT.
+
+    The self-attention layer takes input with size [b, s, h] where b is
+    the batch size, s is the sequence length, and h is the hidden size,
+    and creates output of the same size.
+    Arguments:
+        hidden_size: total hidden size of the layer (h).
+        num_attention_heads: number of attention heads (n). Note that we
+                             require n to be divisible by the number of GPUs
+                             used to parallelize the model. Also, we
+                             require hidden size to be divisible by n.
+        dropout_prob: dropout probability for the attention scores.
+        output_parallel: If true, no all-gather is done on the output and
+                         the output values will be per partition.
+    We use the following notation:
+        h: hidden_size
+        n: num_attention_heads
+        p: number of partitions
+        np: n/p
+        hp: h/p
+        hn: h/n
+        b: batch size
+        s: sequence length
+    """
+    def __init__(self, hidden_size, num_attention_heads,
+                 dropout_prob, output_parallel=False,
+                 init_method=init.xavier_normal_, separate=False,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0, pruning_module=None,
+                 LR_weight_rank=8, LR_mask_rank=8):
+        super(BertParallelSelfAttention, self).__init__()
+        # Input configuration.
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.dropout_prob = dropout_prob
+        self.output_parallel = output_parallel
+        self.separate = separate
+        # Per attention head and per partition values.
+        world_size = get_model_parallel_world_size()
+        self.hidden_size_per_partition = divide(hidden_size, world_size)
+        self.hidden_size_per_attention_head = divide(hidden_size,
+                                                     num_attention_heads)
+        self.num_attention_heads_per_partition = divide(num_attention_heads,
+                                                        world_size)
+        # Strided linear layer.
+        if not separate:
+            self.query_key_value = ColumnParallelLinear(
+                hidden_size, 3 * hidden_size,
+                stride=3,
+                gather_output=False,
+                init_method=init_method,
+                pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self'] else None,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale,
+                LR_weight_rank=LR_weight_rank,
+                LR_mask_rank=LR_mask_rank)
+        else:
+            self.query = ColumnParallelLinear(
+                hidden_size, hidden_size, stride=1,
+                gather_output=False,
+                init_method=init_method,
+                pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfqk'] else None,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale,
+                LR_weight_rank=LR_weight_rank,
+                LR_mask_rank=LR_mask_rank)
+            self.key = ColumnParallelLinear(
+                hidden_size, hidden_size, stride=1,
+                gather_output=False,
+                init_method=init_method,
+                pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfqk'] else None,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale,
+                LR_weight_rank=LR_weight_rank,
+                LR_mask_rank=LR_mask_rank)
+            self.value = ColumnParallelLinear(
+                hidden_size, hidden_size, stride=1,
+                gather_output=False,
+                init_method=init_method,
+                pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfvo'] else None,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale,
+                LR_weight_rank=LR_weight_rank,
+                LR_mask_rank=LR_mask_rank)
+        # self.key_value = ColumnParallelLinear(hidden_size, 2 * hidden_size, stride=2,
+        #                                       gather_output=False,
+        #                                       init_method=init_method,
+        #                                       pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfqk'] else None,
+        #                                       pruning_mask_init=pruning_mask_init,
+        #
pruning_mask_scale=pruning_mask_scale,) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.dropout = torch.nn.Dropout(dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + self.timers = SynchronizedWallClockTimer() + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, pruning_threshold=None,): + + if pruning_threshold is not None: + query_pruning_threshold = pruning_threshold + key_pruning_threshold = pruning_threshold + # value_pruning_threshold = pruning_threshold if pruning_threshold > 0.03 else 0.03 + # value_pruning_threshold = 1 - (1 - pruning_threshold)/0.99*0.95 + value_pruning_threshold = pruning_threshold + + # Attention heads. [b, s, hp] + self.timers('self attention').start() + if not self.separate: + mixed_x_layer = self.query_key_value(hidden_states, pruning_threshold=pruning_threshold) + (mixed_query_layer, + mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + else: + mixed_query_layer = self.query(hidden_states, pruning_threshold=pruning_threshold) + # mixed_key_value_layer = self.key_value(hidden_states, pruning_threshold=key_pruning_threshold) + # (mixed_key_layer, mixed_value_layer) = split_tensor_along_last_dim(mixed_key_value_layer, 2) + mixed_key_layer = self.key(hidden_states, pruning_threshold=pruning_threshold) + mixed_value_layer = self.value(hidden_states, pruning_threshold=pruning_threshold) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + + # Raw attention scores. [b, np, s, s] + norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head)) + attention_scores = torch.matmul(query_layer/norm_factor, + key_layer.transpose(-1, -2)/norm_factor) + # Apply the attention mask. + attention_scores += attention_mask + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + with get_cuda_rng_tracker().fork(): + attention_probs = self.dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + self.timers('self attention').stop() + # Output. 
[b, s, h]
+        self.timers('self attention gather').start()
+        if self.output_parallel:
+            output = context_layer
+        else:
+            output = gather_from_model_parallel_region(context_layer)
+        self.timers('self attention gather').stop()
+        timer_names = ['self attention', 'self attention gather']
+        # self.timers.log(names=timer_names)
+        return output
+
+
+class BertParallelTransformerOutput(torch.nn.Module):
+    """The output layer used after self attention and intermediate
+    parts of transformer layer."""
+    def __init__(self, input_size, output_size, dropout_prob,
+                 layernorm_epsilon=1.0e-12, input_is_parallel=False,
+                 init_method=init.xavier_normal_,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(BertParallelTransformerOutput, self).__init__()
+        # Components.
+        self.dense = RowParallelLinear(input_size,
+                                       output_size,
+                                       input_is_parallel=input_is_parallel,
+                                       init_method=init_method,
+                                       pruning_method=pruning_method,
+                                       pruning_mask_init=pruning_mask_init,
+                                       pruning_mask_scale=pruning_mask_scale)
+        self.dropout = torch.nn.Dropout(dropout_prob)
+        self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon)
+
+    def forward(self, hidden_states, input_tensor, pruning_threshold=None):
+        hidden_states = self.dense(hidden_states,
+                                   pruning_threshold=pruning_threshold)
+        hidden_states = self.dropout(hidden_states)
+        layernorm_input = hidden_states + input_tensor
+        hidden_states = self.layernorm(layernorm_input)
+        return hidden_states
+
+
+class BertParallelTransformerLayer(torch.nn.Module):
+    """A single layer transformer for BERT.
+
+    We use the following notation:
+        h: hidden size
+        n: number of attention heads
+        b: batch size
+        s: sequence length
+    The transformer layer takes input with size [b, s, h] and returns an
+    output of the same size.
+
+    Arguments:
+        hidden_size: The hidden size of the self attention.
+        intermediate_size: size of the intermediate state after
+                           self attention. In both BERT and GPT
+                           this is set to be 4 times the hidden
+                           size.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                scores in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        intermediate_activation_fn: activation function for output
+                                    of intermediate.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method: initialization method used for the weights. Note
+                     that all biases are initialized to zero and
+                     layernorm weights are initialized to one.
+    """
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 intermediate_activation_fn,
+                 layernorm_epsilon,
+                 init_method=init.xavier_normal_,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(BertParallelTransformerLayer, self).__init__()
+
+        # Self attention.
+        self.attention = BertParallelSelfAttention(hidden_size,
+                                                   num_attention_heads,
+                                                   attention_dropout_prob,
+                                                   output_parallel=True,
+                                                   init_method=init_method,
+                                                   pruning_method=pruning_method,
+                                                   pruning_mask_init=pruning_mask_init,
+                                                   pruning_mask_scale=pruning_mask_scale)
+        # Self attention output.
+        self.self_output = BertParallelTransformerOutput(
+            hidden_size, hidden_size, output_dropout_prob,
+            layernorm_epsilon=layernorm_epsilon,
+            input_is_parallel=True,
+            init_method=init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+        # Intermediate.
+        self.intermediate = ColumnParallelLinear(hidden_size, intermediate_size,
+                                                 gather_output=False,
+                                                 init_method=init_method,
+                                                 pruning_method=pruning_method,
+                                                 pruning_mask_init=pruning_mask_init,
+                                                 pruning_mask_scale=pruning_mask_scale)
+        self.intermediate_activation_fn = intermediate_activation_fn
+        # Output.
+        self.output = BertParallelTransformerOutput(
+            intermediate_size, hidden_size, output_dropout_prob,
+            layernorm_epsilon=layernorm_epsilon,
+            input_is_parallel=True,
+            init_method=init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+
+    def forward(self, hidden_states, attention_mask, pruning_threshold=None):
+        # [b, s, hp]
+        attention_output_parallel = self.attention(
+            hidden_states,
+            attention_mask,
+            pruning_threshold=pruning_threshold)
+        # [b, s, h]
+        attention_self_output = self.self_output(
+            attention_output_parallel,
+            hidden_states, pruning_threshold=pruning_threshold)
+        # [b, s, ip]
+        intermediate_output_parallel = self.intermediate(
+            attention_self_output, pruning_threshold=pruning_threshold)
+        intermediate_output_parallel = self.intermediate_activation_fn(
+            intermediate_output_parallel)
+        # [b, s, h]
+        layer_output = self.output(intermediate_output_parallel,
+                                   attention_self_output,
+                                   pruning_threshold=pruning_threshold)
+
+        return layer_output
+
+
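A quick numeric check (not part of the patch) of the scaling trick used in BertParallelSelfAttention.forward above: dividing both q and k by sqrt(sqrt(d)) before the matmul is mathematically the same as dividing the scores by sqrt(d), but keeps the intermediate magnitudes smaller, which is friendlier to fp16.

import math
import torch

b, n, s, d = 2, 4, 8, 16
q, k = torch.randn(b, n, s, d), torch.randn(b, n, s, d)
norm_factor = math.sqrt(math.sqrt(d))
scores_a = torch.matmul(q / norm_factor, k.transpose(-1, -2) / norm_factor)
scores_b = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d)
assert torch.allclose(scores_a, scores_b, atol=1e-5)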
+class PalmParallelSelfAttention(torch.nn.Module):
+    """Parallel self-attention layer for PALM.
+
+    The self-attention layer takes input with size [b, s, h] where b is
+    the batch size, s is the sequence length, and h is the hidden size,
+    and creates output of the same size.
+    Arguments:
+        hidden_size: total hidden size of the layer (h).
+        num_attention_heads: number of attention heads (n). Note that we
+                             require n to be divisible by the number of GPUs
+                             used to parallelize the model. Also, we
+                             require hidden size to be divisible by n.
+        dropout_prob: dropout probability for the attention scores.
+        init_method: weight initialization.
+        output_layer_init_method: output layer initialization. If None, use
+                                  `init_method`.
+    We use the following notation:
+        h: hidden_size
+        n: num_attention_heads
+        p: number of partitions
+        np: n/p
+        hp: h/p
+        hn: h/n
+        b: batch size
+        s: sequence length
+    """
+    def __init__(self, hidden_size, num_attention_heads,
+                 attention_dropout_prob, output_dropout_prob,
+                 init_method, output_layer_init_method=None,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(PalmParallelSelfAttention, self).__init__()
+        # Set output layer initialization if not provided.
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+        # Per attention head and per partition values.
+        world_size = get_model_parallel_world_size()
+        self.hidden_size_per_partition = divide(hidden_size, world_size)
+        self.hidden_size_per_attention_head = divide(hidden_size,
+                                                     num_attention_heads)
+        self.num_attention_heads_per_partition = divide(num_attention_heads,
+                                                        world_size)
+        # Strided linear layer.
+        self.query_key_value = ColumnParallelLinear(hidden_size, 3 * hidden_size,
+                                                    stride=3,
+                                                    gather_output=False,
+                                                    init_method=init_method,
+                                                    pruning_method=pruning_method,
+                                                    pruning_mask_init=pruning_mask_init,
+                                                    pruning_mask_scale=pruning_mask_scale)
+        # Dropout. Note that for a single iteration, this layer will generate
+        # different outputs on different number of parallel partitions but
+        # on average it should not be partition dependent.
+        self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)
+
+        # Output.
+        self.dense = RowParallelLinear(hidden_size,
+                                       hidden_size,
+                                       input_is_parallel=True,
+                                       init_method=output_layer_init_method,
+                                       pruning_method=pruning_method,
+                                       pruning_mask_init=pruning_mask_init,
+                                       pruning_mask_scale=pruning_mask_scale)
+        self.output_dropout = torch.nn.Dropout(output_dropout_prob)
+
+        if deepspeed.checkpointing.is_configured():
+            global get_cuda_rng_tracker, checkpoint
+            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
+            checkpoint = deepspeed.checkpointing.checkpoint
+
+    def _transpose_for_scores(self, tensor):
+        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
+        size [b, np, s, hn].
+        """
+        new_tensor_shape = tensor.size()[:-1] + \
+            (self.num_attention_heads_per_partition,
+             self.hidden_size_per_attention_head)
+        tensor = tensor.view(*new_tensor_shape)
+        return tensor.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, ltor_mask, pruning_threshold=None):
+        # hidden_states: [b, s, h]
+        # ltor_mask: [1, 1, s, s]
+
+        # Attention heads. [b, s, hp]
+        mixed_x_layer = self.query_key_value(
+            hidden_states, pruning_threshold=pruning_threshold)
+        (mixed_query_layer,
+         mixed_key_layer,
+         mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+
+        # Reshape and transpose [b, np, s, hn]
+        query_layer = self._transpose_for_scores(mixed_query_layer)
+        key_layer = self._transpose_for_scores(mixed_key_layer)
+        value_layer = self._transpose_for_scores(mixed_value_layer)
+
+        # Raw attention scores. [b, np, s, s]
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(
+            self.hidden_size_per_attention_head)
+        # Apply the left to right attention mask.
+        attention_scores = torch.mul(attention_scores, ltor_mask) - \
+            10000.0 * (1.0 - ltor_mask)
+
+        # Attention probabilities. [b, np, s, s]
+        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        with get_cuda_rng_tracker().fork():
+            attention_probs = self.attention_dropout(attention_probs)
+
+        # Context layer.
+        # [b, np, s, hn]
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # [b, s, np, hn]
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        # [b, s, hp]
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        # Output. [b, s, h]
+        output = self.dense(context_layer, pruning_threshold=pruning_threshold)
+        output = self.output_dropout(output)
+
+        return output
+
+
+class PalmParallelCrossAttention(torch.nn.Module):
+    """Parallel cross-attention layer for PALM.
+
+    The cross-attention layer takes decoder states with size [b, s, h],
+    where b is the batch size, s is the sequence length, and h is the
+    hidden size, attends over the encoder states, and creates output of
+    the same size.
+    Arguments:
+        hidden_size: total hidden size of the layer (h).
+ num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size to be divisible by n. + dropout_prob: dropout probability for the attention scores. + init_method: weight initialization. + output_layer_init_method: output layer initialization. If None, use + `init_method`. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + def __init__(self, hidden_size, num_attention_heads, + attention_dropout_prob, output_dropout_prob, + init_method, output_layer_init_method=None, attn_separate=False, + pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0, pruning_module='all', + LR_weight_rank=8, LR_mask_rank=8): + super(PalmParallelCrossAttention, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide(num_attention_heads, + world_size) + # Strided linear layer. + self.query = ColumnParallelLinear(hidden_size, hidden_size, + stride=1, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method if pruning_module in ['all', 'decoder', 'decoder_cross', 'cross'] else None, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + + if not attn_separate: + self.key_value = ColumnParallelLinear(hidden_size, 2*hidden_size, + stride=2, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_cross', 'cross'] else None, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + else: + self.key = ColumnParallelLinear(hidden_size, hidden_size, + stride=1, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method if pruning_module in ['all', 'decoder', 'decoder_cross', 'cross'] else None, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + self.value = ColumnParallelLinear(hidden_size, hidden_size, + stride=1, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method if pruning_module in ['all', 'decoder', 'decoder_cross', 'cross'] else None, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. 
+        self.dense = RowParallelLinear(hidden_size,
+                                       hidden_size,
+                                       input_is_parallel=True,
+                                       init_method=output_layer_init_method,
+                                       pruning_method=pruning_method if pruning_module in ['all', 'decoder', 'decoder_cross', 'cross'] else None,
+                                       pruning_mask_init=pruning_mask_init,
+                                       pruning_mask_scale=pruning_mask_scale,
+                                       LR_weight_rank=LR_weight_rank,
+                                       LR_mask_rank=LR_mask_rank)
+        self.output_dropout = torch.nn.Dropout(output_dropout_prob)
+        self.attn_separate = attn_separate
+        if deepspeed.checkpointing.is_configured():
+            global get_cuda_rng_tracker, checkpoint
+            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
+            checkpoint = deepspeed.checkpointing.checkpoint
+
+    def _transpose_for_scores(self, tensor):
+        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
+        size [b, np, s, hn].
+        """
+        new_tensor_shape = tensor.size()[:-1] + \
+            (self.num_attention_heads_per_partition,
+             self.hidden_size_per_attention_head)
+        tensor = tensor.view(*new_tensor_shape)
+        return tensor.permute(0, 2, 1, 3)
+
+    def forward(self, query, enc_hidden_states, enc_attn_mask,
+                pruning_threshold=None):
+        # query: [b, s, h] (decoder hidden states)
+        # enc_hidden_states: [b, s_enc, h]
+        # enc_attn_mask: additive attention mask over the encoder states
+
+        # A single pruning threshold is shared by the query, key, value
+        # and dense projections below.
+
+        # Attention heads. [b, s, hp]
+        mixed_query_layer = self.query(query,
+                                       pruning_threshold=pruning_threshold)
+        if not self.attn_separate:
+            mixed_x_layer = self.key_value(enc_hidden_states,
+                                           pruning_threshold=pruning_threshold)
+            (mixed_key_layer,
+             mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 2)
+        else:
+            mixed_key_layer = self.key(enc_hidden_states,
+                                       pruning_threshold=pruning_threshold)
+            mixed_value_layer = self.value(enc_hidden_states,
+                                           pruning_threshold=pruning_threshold)
+
+        # Reshape and transpose [b, np, s, hn]
+        query_layer = self._transpose_for_scores(mixed_query_layer)
+        key_layer = self._transpose_for_scores(mixed_key_layer)
+        value_layer = self._transpose_for_scores(mixed_value_layer)
+
+        # Raw attention scores. [b, np, s, s]
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(
+            self.hidden_size_per_attention_head)
+        attention_scores += enc_attn_mask
+
+        # Attention probabilities. [b, np, s, s]
+        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        with get_cuda_rng_tracker().fork():
+            attention_probs = self.attention_dropout(attention_probs)
+
+        # Context layer.
+        # [b, np, s, hn]
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # [b, s, np, hn]
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        # [b, s, hp]
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        # Output. [b, s, h]
+        output = self.dense(context_layer, pruning_threshold=pruning_threshold)
+        output = self.output_dropout(output)
+
+        return output
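A self-contained shape walk-through (not part of the patch) of the _transpose_for_scores reshape shared by the attention classes above: [b, s, np*hn] -> [b, s, np, hn] -> [b, np, s, hn].

import torch

b, s, np_, hn = 2, 8, 4, 16           # batch, seq, heads-per-partition, head dim
x = torch.randn(b, s, np_ * hn)       # [b, s, np*hn]
x = x.view(b, s, np_, hn).permute(0, 2, 1, 3)
assert x.shape == (b, np_, s, hn)     # [b, np, s, hn]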
diff --git a/modelscope/utils/nlp/mpu/utils.py b/modelscope/utils/nlp/mpu/utils.py
new file mode 100755
index 00000000..94afafd5
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/utils.py
@@ -0,0 +1,70 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+
+
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
+        numerator, denominator)
+
+
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+
+
+def split_tensor_along_last_dim(tensor, num_partitions,
+                                contiguous_split_chunks=False):
+    """Split a tensor along its last dimension.
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor into.
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks and return the
+    first and last index of the vocabulary belonging to the `rank`
+    partition. Note that indices are in [first, last)."""
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
+                                                  rank, world_size):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank,
+                                           world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            per_partition_vocab_size, rank, world_size)
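A tiny usage sketch (not part of the patch) of VocabUtility, assuming the module path this patch introduces: with a 32000-token vocabulary split across 4 model-parallel ranks, rank 1 owns the half-open index range [8000, 16000).

from modelscope.utils.nlp.mpu.utils import VocabUtility

first, last = VocabUtility.vocab_range_from_global_vocab_size(
    global_vocab_size=32000, rank=1, world_size=4)
assert (first, last) == (8000, 16000)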
diff --git a/modelscope/utils/nlp/utils.py b/modelscope/utils/nlp/utils.py
new file mode 100644
index 00000000..19bba16a
--- /dev/null
+++ b/modelscope/utils/nlp/utils.py
@@ -0,0 +1,109 @@
+"""Utilities for logging and serialization"""
+
+import os
+import random
+import time
+
+import numpy as np
+import torch
+
+
+def get_log_constant(user_log):
+    return '[user log]' if user_log else ''
+
+
+def print_rank_0(message):
+    if torch.distributed.is_initialized():
+        if torch.distributed.get_rank() == 0:
+            print(message, flush=True)
+    else:
+        print(message, flush=True)
+
+
+def print_args(args):
+    """Print arguments."""
+
+    print('arguments:', flush=True)
+    for arg in vars(args):
+        dots = '.' * (29 - len(arg))
+        print('  {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)
+
+
+def report_memory(name):
+    """Simple GPU memory report."""
+
+    mega_bytes = 1024.0 * 1024.0
+    string = name + ' memory (MB)'
+    string += ' | allocated: {}'.format(
+        torch.cuda.memory_allocated() / mega_bytes)
+    string += ' | max allocated: {}'.format(
+        torch.cuda.max_memory_allocated() / mega_bytes)
+    string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes)
+    string += ' | max cached: {}'.format(
+        torch.cuda.max_memory_cached() / mega_bytes)
+    print_rank_0(string)
+
+
+class Timers:
+    """Group of timers."""
+
+    class Timer:
+        """Timer."""
+
+        def __init__(self, name):
+            self.name_ = name
+            self.elapsed_ = 0.0
+            self.started_ = False
+            self.start_time = time.time()
+
+        def start(self):
+            """Start the timer."""
+            assert not self.started_, 'timer has already been started'
+            torch.cuda.synchronize()
+            self.start_time = time.time()
+            self.started_ = True
+
+        def stop(self):
+            """Stop the timer."""
+            assert self.started_, 'timer is not started'
+            torch.cuda.synchronize()
+            self.elapsed_ += (time.time() - self.start_time)
+            self.started_ = False
+
+        def reset(self):
+            """Reset timer."""
+            self.elapsed_ = 0.0
+            self.started_ = False
+
+        def elapsed(self, reset=True):
+            """Calculate the elapsed time."""
+            started_ = self.started_
+            # If timing is in progress, end it first.
+            if self.started_:
+                self.stop()
+            # Get the elapsed time.
+            elapsed_ = self.elapsed_
+            # Reset the elapsed time.
+            if reset:
+                self.reset()
+            # If timing was in progress, restart it.
+            if started_:
+                self.start()
+            return elapsed_
+
+    def __init__(self):
+        self.timers = {}
+
+    def __call__(self, name):
+        if name not in self.timers:
+            self.timers[name] = self.Timer(name)
+        return self.timers[name]
+
+    def log(self, names, normalizer=1.0, reset=True):
+        """Log a group of timers."""
+        assert normalizer > 0.0
+        string = 'time (ms)'
+        for name in names:
+            elapsed_time = self.timers[name].elapsed(
+                reset=reset) * 1000.0 / normalizer
+            string += ' | {}: {:.2f}'.format(name, elapsed_time)
+        print_rank_0(string)
diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py
index 1f157f9a..ee9dde13 100644
--- a/modelscope/utils/torch_utils.py
+++ b/modelscope/utils/torch_utils.py
@@ -50,13 +50,13 @@ def _init_dist_pytorch(backend: str, **kwargs) -> None:
     # rank = int(os.environ['RANK'])
     local_rank = int(os.environ['LOCAL_RANK'])
-    torch.cuda.set_device(local_rank)
+    # torch.cuda.set_device(local_rank)
     dist.init_process_group(backend=backend, **kwargs)
 
 
 def _init_dist_mpi(backend: str, **kwargs) -> None:
     local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
-    torch.cuda.set_device(local_rank)
+    # torch.cuda.set_device(local_rank)
     if 'MASTER_PORT' not in os.environ:
         # 29500 is torch.distributed default port
         os.environ['MASTER_PORT'] = '29500'
@@ -82,7 +82,7 @@ def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None:
     ntasks = int(os.environ['SLURM_NTASKS'])
     node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
-    torch.cuda.set_device(proc_id % num_gpus)
+    # torch.cuda.set_device(proc_id % num_gpus)
     addr = subprocess.getoutput(
         f'scontrol show hostname {node_list} | head -n1')
     # specify master port
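A minimal usage sketch (not part of the patch) of the Timers helper added in modelscope/utils/nlp/utils.py above, assuming a CUDA-capable process (start/stop call torch.cuda.synchronize()) and the import path this patch introduces.

from modelscope.utils.nlp.utils import Timers

timers = Timers()
timers('forward').start()
# ... timed work goes here ...
timers('forward').stop()
timers.log(['forward'])  # prints e.g. "time (ms) | forward: 12.34" on rank 0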
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index c08209a4..3a56a941 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -3,7 +3,7 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration
+from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration, PlugForTextGeneration
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextGenerationPipeline
 from modelscope.preprocessors import TextGenerationPreprocessor
@@ -34,83 +34,15 @@ class TextGenerationTest(unittest.TestCase):
         self.gpt3_large_model_id = 'damo/nlp_gpt3_text-generation_chinese-large'
         self.gpt3_input = '《故乡》。深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地,'
 
-    def run_pipeline_with_model_instance(self, model_id, input):
-        model = Model.from_pretrained(model_id)
-        preprocessor = TextGenerationPreprocessor(
-            model.model_dir,
-            model.tokenizer,
-            first_sequence='sentence',
-            second_sequence=None)
-        pipeline_ins = pipeline(
-            task=Tasks.text_generation, model=model, preprocessor=preprocessor)
-        print(pipeline_ins(input))
+        self.plug_model_id = 'damo/nlp_plug_text-generation_chinese'
+        self.plug_input = '段誉轻挥折扇,摇了摇头,说'
 
-    def run_pipeline_with_model_id(self, model_id, input):
-        pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
-        print(pipeline_ins(input))
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_palm_zh_with_model_name(self):
-        self.run_pipeline_with_model_id(self.palm_model_id_zh,
-                                        self.palm_input_zh)
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_palm_en_with_model_name(self):
-        self.run_pipeline_with_model_id(self.palm_model_id_en,
-                                        self.palm_input_en)
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_gpt_base_with_model_name(self):
-        self.run_pipeline_with_model_id(self.gpt3_base_model_id,
-                                        self.gpt3_input)
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_gpt_large_with_model_name(self):
-        self.run_pipeline_with_model_id(self.gpt3_large_model_id,
-                                        self.gpt3_input)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_palm_zh_with_model_instance(self):
-        self.run_pipeline_with_model_instance(self.palm_model_id_zh,
-                                              self.palm_input_zh)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_palm_en_with_model_instance(self):
-        self.run_pipeline_with_model_instance(self.palm_model_id_en,
-                                              self.palm_input_en)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_gpt_base_with_model_instance(self):
-        self.run_pipeline_with_model_instance(self.gpt3_base_model_id,
-                                              self.gpt3_input)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_gpt_large_with_model_instance(self):
-        self.run_pipeline_with_model_instance(self.gpt3_large_model_id,
-                                              self.gpt3_input)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_palm(self):
-        for model_id, input in ((self.palm_model_id_zh, self.palm_input_zh),
-                                (self.palm_model_id_en, self.palm_input_en)):
-            cache_path = snapshot_download(model_id)
-            model = PalmForTextGeneration.from_pretrained(cache_path)
-            preprocessor = TextGenerationPreprocessor(
-                cache_path,
-                model.tokenizer,
-                first_sequence='sentence',
-                second_sequence=None)
-            pipeline1 = TextGenerationPipeline(model, preprocessor)
-            pipeline2 = pipeline(
-                Tasks.text_generation, model=model, preprocessor=preprocessor)
-            print(
-                f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
-            )
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_gpt3(self):
-        cache_path = snapshot_download(self.gpt3_base_model_id)
-        model = GPT3ForTextGeneration(cache_path)
+    def test_plug(self):
+        import torch
+        print('start_method',
+              str(torch.multiprocessing.get_start_method(allow_none=True)))
+        torch.multiprocessing.set_start_method('spawn')
+        cache_path = snapshot_download(self.plug_model_id)
+        model = PlugForTextGeneration(cache_path)
         preprocessor = TextGenerationPreprocessor(
             cache_path,
             model.tokenizer,
@@ -120,13 +52,102 @@ class TextGenerationTest(unittest.TestCase):
         pipeline2 = pipeline(
             Tasks.text_generation, model=model, preprocessor=preprocessor)
         print(
-            f'pipeline1: {pipeline1(self.gpt3_input)}\npipeline2: {pipeline2(self.gpt3_input)}'
+            f'pipeline1: {pipeline1(self.plug_input)}\npipeline2: {pipeline2(self.plug_input)}'
         )
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_default_model(self):
-        pipeline_ins = pipeline(task=Tasks.text_generation)
-        print(pipeline_ins(self.palm_input_zh))
+    # def run_pipeline_with_model_instance(self, model_id, input):
+    #     model = Model.from_pretrained(model_id)
+    #     preprocessor = TextGenerationPreprocessor(
+    #         model.model_dir,
+    #         model.tokenizer,
+    #         first_sequence='sentence',
+    #         second_sequence=None)
+    #     pipeline_ins = pipeline(
+    #         task=Tasks.text_generation, model=model, preprocessor=preprocessor)
+    #     print(pipeline_ins(input))
+
+    # def run_pipeline_with_model_id(self, model_id, input):
+    #     pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
+    #     print(pipeline_ins(input))
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    # def test_palm_zh_with_model_name(self):
+    #     self.run_pipeline_with_model_id(self.palm_model_id_zh,
+    #                                     self.palm_input_zh)
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    # def test_palm_en_with_model_name(self):
+    #     self.run_pipeline_with_model_id(self.palm_model_id_en,
+    #                                     self.palm_input_en)
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    # def test_gpt_base_with_model_name(self):
+    #     self.run_pipeline_with_model_id(self.gpt3_base_model_id,
+    #                                     self.gpt3_input)
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    # def test_gpt_large_with_model_name(self):
+    #     self.run_pipeline_with_model_id(self.gpt3_large_model_id,
+    #                                     self.gpt3_input)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_palm_zh_with_model_instance(self):
+    #     self.run_pipeline_with_model_instance(self.palm_model_id_zh,
+    #                                           self.palm_input_zh)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_palm_en_with_model_instance(self):
+    #     self.run_pipeline_with_model_instance(self.palm_model_id_en,
+    #                                           self.palm_input_en)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_gpt_base_with_model_instance(self):
+    #     self.run_pipeline_with_model_instance(self.gpt3_base_model_id,
+    #                                           self.gpt3_input)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_gpt_large_with_model_instance(self):
+    #     self.run_pipeline_with_model_instance(self.gpt3_large_model_id,
+    #                                           self.gpt3_input)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_run_palm(self):
+    #     for model_id, input in 
((self.palm_model_id_zh, self.palm_input_zh), + # (self.palm_model_id_en, self.palm_input_en)): + # cache_path = snapshot_download(model_id) + # model = PalmForTextGeneration.from_pretrained(cache_path) + # preprocessor = TextGenerationPreprocessor( + # cache_path, + # model.tokenizer, + # first_sequence='sentence', + # second_sequence=None) + # pipeline1 = TextGenerationPipeline(model, preprocessor) + # pipeline2 = pipeline( + # Tasks.text_generation, model=model, preprocessor=preprocessor) + # print( + # f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}' + # ) + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + # def test_run_gpt3(self): + # cache_path = snapshot_download(self.gpt3_base_model_id) + # model = GPT3ForTextGeneration(cache_path) + # preprocessor = TextGenerationPreprocessor( + # cache_path, + # model.tokenizer, + # first_sequence='sentence', + # second_sequence=None) + # pipeline1 = TextGenerationPipeline(model, preprocessor) + # pipeline2 = pipeline( + # Tasks.text_generation, model=model, preprocessor=preprocessor) + # print( + # f'pipeline1: {pipeline1(self.gpt3_input)}\npipeline2: {pipeline2(self.gpt3_input)}' + # ) + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + # def test_run_with_default_model(self): + # pipeline_ins = pipeline(task=Tasks.text_generation) + # print(pipeline_ins(self.palm_input_zh)) if __name__ == '__main__':