This commit is contained in:
Yufeng
2022-11-09 16:54:52 +08:00
parent 42011e48d3
commit 0399b91e92
55 changed files with 14875 additions and 2 deletions

View File

@@ -84,6 +84,7 @@ class Models(object):
T5 = 'T5'
mglm = 'mglm'
bloom = 'bloom'
txl = 'txl'
# audio models
sambert_hifigan = 'sambert-hifigan'
@@ -253,6 +254,7 @@ class Pipelines(object):
document_segmentation = 'document-segmentation'
feature_extraction = 'feature-extraction'
mglm_text_summarization = 'mglm-text-summarization'
txl_fast_poem = 'txl-fast-poem'
translation_en_to_de = 'translation_en_to_de' # keep it underscore
translation_en_to_ro = 'translation_en_to_ro' # keep it underscore
translation_en_to_fr = 'translation_en_to_fr' # keep it underscore
@@ -379,6 +381,7 @@ class Preprocessors(object):
document_segmentation = 'document-segmentation'
feature_extraction = 'feature-extraction'
mglm_summarization = 'mglm-summarization'
txl_fast_poem = 'txl-fast-poem'
sentence_piece = 'sentence-piece'
# audio preprocessor

View File

@@ -36,6 +36,7 @@ if TYPE_CHECKING:
)
from .T5 import T5ForConditionalGeneration
from .mglm import MGLMForTextSummarization
from .txl_poem import TXLForFastPoem
from .task_models import (
FeatureExtractionModel,
InformationExtractionModel,
@@ -108,6 +109,7 @@ else:
'sentence_embedding': ['SentenceEmbedding'],
'T5': ['T5ForConditionalGeneration'],
'mglm': ['MGLMForTextSummarization'],
'txl_poem': ['TXLForFastPoem'],
'gpt_neo': ['GPTNeoModel'],
'bloom': ['BloomModel'],
}

View File

@@ -0,0 +1,22 @@
# Modified by Zhipu.AI
# Original Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
    # Static type-checkers see the real import so TXLForFastPoem
    # resolves; at runtime the lazy path below is used instead.
    from .txl_for_fast_poem import TXLForFastPoem
else:
    # Map submodule name -> exported symbols for lazy resolution.
    _import_structure = {
        'txl_for_fast_poem': ['TXLForFastPoem'],
    }

    import sys

    # Replace this package module with a LazyImportModule so heavy
    # model dependencies load only on first attribute access.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

View File

@@ -0,0 +1,946 @@
# Modified by Zhipu.AI
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""argparser configuration"""
import argparse
import os
import subprocess
import deepspeed
import json
import torch
def get_hostname():
    """Return the first IP address reported by ``hostname -I``.

    Used to discover a master address for distributed training when
    the job is launched through MPI (see ``mpi_define_env``).

    Returns:
        str: the first whitespace-separated token of ``hostname -I``
        output, i.e. the host's primary IP address.
    """
    # Run the command directly (no shell). The original passed the
    # single string 'hostname -I' inside a list together with
    # shell=True, which only worked by accident on POSIX; the
    # argument-list form is the documented, portable usage.
    result = subprocess.check_output(['hostname', '-I'])
    master_addr = result.decode('utf-8').split()[0]
    return master_addr
def add_model_config_args(parser):
    """Model arguments.

    Registers architecture hyper-parameters (layers, hidden size,
    attention heads, dropouts, vocab/position sizes) on *parser* under
    the 'model' argument group and returns the same parser.
    """
    group = parser.add_argument_group('model', 'model configuration')
    group.add_argument(
        '--transformer-xl',
        action='store_true',
        help='use transformer-xl for training')
    group.add_argument(
        '--pretrained-bert',
        action='store_true',
        help='use a pretrained bert-large-uncased model instead'
        'of initializing from scratch. See '
        '--tokenizer-model-type to specify which pretrained '
        'BERT model to use')
    group.add_argument(
        '--encoder-decoder',
        action='store_true',
        help='use the encoder-decoder architecture for blocklm')
    group.add_argument(
        '--attention-dropout',
        type=float,
        default=0.1,
        help='dropout probability for attention weights')
    group.add_argument(
        '--num-attention-heads',
        type=int,
        default=16,  # yuandong64
        help='num of transformer attention heads')
    group.add_argument(
        '--hidden-size',
        type=int,
        default=1024,  # yuandong4096
        help='tansformer hidden size')
    group.add_argument(
        '--intermediate-size',
        type=int,
        default=None,
        help='transformer embedding dimension for FFN'
        'set to 4*`--hidden-size` if it is None')
    group.add_argument(
        '--num-layers',
        type=int,
        default=24,  # yuandong48
        help='num decoder layers')
    group.add_argument(
        '--layernorm-epsilon',
        type=float,
        default=1e-5,
        help='layer norm epsilon')
    group.add_argument(
        '--hidden-dropout',
        type=float,
        default=0.1,
        help='dropout probability for hidden state transformer')
    group.add_argument(
        '--output-dropout',
        type=float,
        default=0.1,
        help='dropout probability for pooled output')
    group.add_argument(
        '--max-position-embeddings',
        type=int,
        default=512,  # yuandong1024
        help='maximum number of position embeddings to use')
    group.add_argument(
        '--max-sequence-length',
        type=int,
        default=512,
        help='maximum number of position embeddings to use')
    group.add_argument(
        '--vocab-size',
        type=int,
        default=30522,
        help='vocab size to use for non-character-level '
        'tokenization. This value will only be used when '
        'creating a tokenizer')
    group.add_argument(
        '--deep-init',
        action='store_true',
        help='initialize bert model similar to gpt2 model.'
        'scales initialization of projection layers by a '
        'factor of 1/sqrt(2N). Necessary to train bert '
        'models larger than BERT-Large.')
    group.add_argument(
        '--make-vocab-size-divisible-by',
        type=int,
        default=128,
        help='Pad the vocab size to be divisible by this value.'
        'This is added for computational efficieny reasons.')
    # Optimizer placement flags (note: --cpu_torch_adam keeps its
    # original underscore spelling for backward compatibility).
    group.add_argument(
        '--cpu-optimizer', action='store_true', help='Run optimizer on CPU')
    group.add_argument(
        '--cpu_torch_adam',
        action='store_true',
        help='Use Torch Adam as optimizer on CPU.')
    group.add_argument(
        '--sandwich-ln',
        action='store_true',
        help='add sandwich ln in cogview.')
    return parser
def add_fp16_config_args(parser):
    """Mixed precision arguments.

    Registers fp16/loss-scaling flags under the 'fp16' argument group
    and returns the same parser. The fp32-* flags are only meaningful
    when --fp16 is set (see get_args, which clears them otherwise).
    """
    group = parser.add_argument_group('fp16', 'fp16 configurations')
    group.add_argument(
        '--fp16', action='store_true', help='Run model in fp16 mode')
    group.add_argument(
        '--fp32-embedding', action='store_true', help='embedding in fp32')
    group.add_argument(
        '--fp32-layernorm', action='store_true', help='layer norm in fp32')
    group.add_argument(
        '--fp32-tokentypes',
        action='store_true',
        help='embedding token types in fp32')
    group.add_argument(
        '--fp32-allreduce', action='store_true', help='all-reduce in fp32')
    group.add_argument(
        '--hysteresis',
        type=int,
        default=2,
        help='hysteresis for dynamic loss scaling')
    group.add_argument(
        '--loss-scale',
        type=float,
        default=None,
        help='Static loss scaling, positive power of 2 '
        'values can improve fp16 convergence. If None, dynamic'
        'loss scaling is used.')
    group.add_argument(
        '--loss-scale-window',
        type=float,
        default=1000,
        help='Window over which to raise/lower dynamic scale')
    group.add_argument(
        '--min-scale',
        type=float,
        default=1,
        help='Minimum loss scale for dynamic loss scale')
    group.add_argument('--attention-scale', type=float, default=1.0)
    return parser
def add_training_args(parser):
    """Training arguments.

    Registers optimization, checkpointing, learning-rate and
    distributed-launch flags under the 'train' argument group and
    returns the same parser.
    """
    group = parser.add_argument_group('train', 'training configurations')
    group.add_argument(
        '--experiment-name',
        type=str,
        default='gpt-345M',
        help='The experiment name for summary and checkpoint')
    group.add_argument(
        '--batch-size', type=int, default=4, help='Data Loader batch size')
    group.add_argument(
        '--gradient-accumulation-steps',
        type=int,
        default=1,
        help='Data Loader batch size')
    group.add_argument(
        '--weight-decay',
        type=float,
        default=0.01,
        help='weight decay coefficient for L2 regularization')
    group.add_argument(
        '--checkpoint-activations',
        action='store_true',
        help='checkpoint activation to allow for training '
        'with larger models and sequences')
    group.add_argument(
        '--checkpoint-num-layers',
        type=int,
        default=1,
        help='chunk size (number of layers) for checkpointing')
    group.add_argument(
        '--deepspeed-activation-checkpointing',
        action='store_true',
        help='uses activation checkpointing from deepspeed')
    group.add_argument(
        '--epochs',
        type=int,
        default=None,
        help='Number of finetunning epochs. Zero results in evaluation only.')
    group.add_argument(
        '--clip-grad', type=float, default=1.0, help='gradient clipping')
    group.add_argument(
        '--train-iters',
        type=int,
        default=0,  # 1000000->0
        help='total number of iterations to train over all training runs')
    group.add_argument('--label-smoothing', type=float, default=0.0)
    group.add_argument(
        '--log-interval', type=int, default=100, help='report interval')
    group.add_argument(
        '--exit-interval',
        type=int,
        default=None,
        help='Exit the program after this many new iterations.')
    group.add_argument(
        '--summary-dir',
        type=str,
        default='',
        help='The directory to store the summary')
    group.add_argument('--seed', type=int, default=1234, help='random seed')
    # Batch prodecuer arguments
    group.add_argument(
        '--reset-position-ids',
        action='store_true',
        help='Reset posistion ids after end-of-document token.')
    group.add_argument(
        '--reset-attention-mask',
        action='store_true',
        help='Reset self attention maske after '
        'end-of-document token.')
    # Learning rate.
    group.add_argument(
        '--lr-decay-iters',
        type=int,
        default=None,
        help='number of iterations to decay LR over,'
        ' If None defaults to `--train-iters`*`--epochs`')
    group.add_argument(
        '--lr-decay-style',
        type=str,
        default='linear',
        choices=['constant', 'linear', 'cosine', 'exponential'],
        help='learning rate decay function')
    group.add_argument('--lr-decay-ratio', type=float, default=0.5)
    group.add_argument(
        '--lr', type=float, default=1.0e-4, help='initial learning rate')
    group.add_argument(
        '--warmup',
        type=float,
        default=0.01,
        # BUGFIX: argparse %-interpolates help strings, so a bare '%'
        # made `--help` raise; it must be escaped as '%%'.
        help='percentage of data to warmup on (.01 = 1%% of all '
        'training iters). Default 0.01')
    group.add_argument(
        '--switch-linear',
        action='store_true',
        help='Switch to linear decay for cosine decay')
    # model checkpointing
    group.add_argument(
        '--save',
        type=str,
        default=None,
        help='Output directory to save checkpoints to.')
    group.add_argument('--new-save-directory', action='store_true')
    group.add_argument(
        '--save-epoch',
        type=int,
        default=1,
        help='number of epochs between saves')
    group.add_argument(
        '--save-interval',
        type=int,
        default=5000,
        help='number of iterations between saves')
    group.add_argument(
        '--no-save-optim',
        action='store_true',
        help='Do not save current optimizer.')
    group.add_argument(
        '--no-save-rng',
        action='store_true',
        help='Do not save current rng state.')
    group.add_argument(
        '--load',
        type=str,
        default=None,
        help='Path to a directory containing a model checkpoint.')
    group.add_argument(
        '--no-load-optim',
        action='store_true',
        help='Do not load optimizer when loading checkpoint.')
    group.add_argument(
        '--no-load-rng',
        action='store_true',
        help='Do not load rng state when loading checkpoint.')
    group.add_argument(
        '--no-load-lr-scheduler',
        action='store_true',
        help='Do not load lr scheduler when loading checkpoint.')
    group.add_argument(
        '--no-deepspeed-load',
        action='store_true',
        help='Not use deepspeed when loading checkpoint')
    group.add_argument(
        '--finetune',
        action='store_true',
        help='Load model for finetuning. Do not load optimizer '
        'or rng state from checkpoint and set iteration to 0. '
        'Assumed when loading a release checkpoint.')
    group.add_argument(
        '--mode',
        type=str,
        default='pretrain',
        choices=['pretrain', 'finetune', 'inference'],
        help=
        'what type of task to use, will influence auto-warmup, exp name, iteration'
    )
    group.add_argument(
        '--resume-dataloader',
        action='store_true',
        help='Resume the dataloader when resuming training. '
        'Does not apply to tfrecords dataloader, try resuming'
        'with a different seed in this case.')
    # distributed training args
    group.add_argument(
        '--distributed-backend',
        default='nccl',
        help='which backend to use for distributed '
        'training. One of [gloo, nccl]')
    group.add_argument(
        '--DDP-impl',
        default='torch',
        choices=['local', 'torch', 'none'],
        help='which DistributedDataParallel implementation to use.')
    group.add_argument(
        '--local_rank',
        type=int,
        default=None,
        help='local rank passed from distributed launcher')
    return parser
def add_evaluation_args(parser):
    """Evaluation arguments.

    Registers validation/test-time flags under the 'validation'
    argument group and returns the same parser.
    """
    group = parser.add_argument_group('validation',
                                      'validation configurations')
    group.add_argument(
        '--eval-batch-size',
        type=int,
        default=None,
        help='Data Loader batch size for evaluation datasets.'
        'Defaults to `--batch-size`')
    group.add_argument(
        '--eval-iters',
        type=int,
        default=100,
        help='number of iterations to run for evaluation'
        'validation/test for')
    group.add_argument(
        '--eval-interval',
        type=int,
        default=1000,
        help='interval between running evaluation on validation set')
    group.add_argument(
        '--eval-epoch',
        type=int,
        default=1,
        help='epoch between running evaluation on validation set')
    group.add_argument(
        '--eval-seq-length',
        type=int,
        default=None,
        help='Maximum sequence length to process for '
        'evaluation. Defaults to `--seq-length`')
    group.add_argument(
        '--eval-max-preds-per-seq',
        type=int,
        default=None,
        help='Maximum number of predictions to use for '
        'evaluation. Defaults to '
        'math.ceil(`--eval-seq-length`*.15/10)*10')
    group.add_argument(
        '--overlapping-eval',
        type=int,
        default=32,
        help='sliding window for overlapping eval ')
    # NOTE: --cloze-eval moved to add_finetune_config_args; kept here
    # as a historical comment in the original source.
    # group.add_argument('--cloze-eval', action='store_true',
    #                    help='Evaluation dataset from `--valid-data` is a cloze task')
    group.add_argument(
        '--eval-hf',
        action='store_true',
        help='perform evaluation with huggingface openai model.'
        'use `--load` to specify weights path to be loaded')
    group.add_argument(
        '--load-openai',
        action='store_true',
        help='load openai weights into our model. Use `--load` '
        'to specify weights path to be loaded')
    return parser
def add_text_generate_args(parser):
    """Register sampling / decoding flags for text generation.

    Adds a 'Text generation' argument group covering temperature /
    top-k / top-p sampling, beam search, length constraints and I/O
    paths, then returns the parser.
    """
    gen = parser.add_argument_group('Text generation', 'configurations')

    # Sampling hyper-parameters.
    gen.add_argument('--temperature', type=float, default=1.0)
    gen.add_argument('--top_p', type=float, default=0.0)
    gen.add_argument('--top_k', type=int, default=0)
    gen.add_argument('--num-beams', type=int, default=1)

    # Output-length controls (default shortened from 512).
    gen.add_argument('--out-seq-length', type=int, default=256)  # yuandong512
    gen.add_argument('--length-penalty', type=float, default=0.0)
    gen.add_argument('--no-repeat-ngram-size', type=int, default=0)
    gen.add_argument('--min-tgt-length', type=int, default=0)
    gen.add_argument('--select-topk', action='store_true')
    gen.add_argument('--blank-maskratio', type=float, default=0.1)

    # Input/output plumbing.
    gen.add_argument(
        '--input-source',
        type=str,
        default='interactive',
        help='what input mode to use, interactive or path')
    gen.add_argument(
        '--output-path',
        type=str,
        default='./samples',
        help='path to place the generated samples')
    gen.add_argument(
        '--with-id',
        action='store_true',
        help='If each line is prepended with an id.')
    gen.add_argument('--max-inference-batch-size', type=int, default=12)
    gen.add_argument('--device', type=int, default=-1)
    return parser
def add_data_args(parser):
    """Train/valid/test data arguments.

    Registers dataset paths, loader behavior and model-parallel size
    under the 'data' argument group and returns the same parser.
    """
    group = parser.add_argument_group('data', 'data configurations')
    group.add_argument(
        '--model-parallel-size',
        type=int,
        default=1,
        help='size of the model parallel.')
    group.add_argument(
        '--shuffle',
        action='store_true',
        help='Shuffle data. Shuffling is deterministic '
        'based on seed and current epoch.')
    group.add_argument('--filter-english', action='store_true')
    group.add_argument(
        '--train-data',
        nargs='+',
        default=None,
        help='Whitespace separated filenames or corpora names '
        'for training.')
    group.add_argument(
        '--valid-data',
        nargs='*',
        default=None,
        help="""Filename for validation data.""")
    group.add_argument(
        '--test-data',
        nargs='*',
        default=None,
        help="""Filename for testing""")
    group.add_argument(
        '--data-dir',
        type=str,
        default=None,
        help='The data path to all the data files')
    # Numpy loader: requires the three explicit *-data-path args below.
    group.add_argument(
        '--use-npy-data-loader',
        action='store_true',
        help='Use the numpy data loader. If set, then'
        'train-data-path, val-data-path, and test-data-path'
        'should also be provided.')
    group.add_argument(
        '--train-data-path',
        type=str,
        default='',
        help='path to the training data')
    group.add_argument(
        '--val-data-path',
        type=str,
        default='',
        help='path to the validation data')
    group.add_argument(
        '--test-data-path', type=str, default='', help='path to the test data')
    group.add_argument(
        '--input-data-sizes-file',
        type=str,
        default='sizes.txt',
        help='the filename containing all the shards sizes')
    group.add_argument(
        '--delim', default=',', help='delimiter used to parse csv data files')
    group.add_argument(
        '--text-key',
        default='sentence',
        help='key to use to extract text from json/csv')
    group.add_argument(
        '--eval-text-key',
        default=None,
        help='key to use to extract text from '
        'json/csv evaluation datasets')
    group.add_argument(
        '--split',
        default='1000,1,1',
        help='comma-separated list of proportions for training,'
        ' validation, and test split')
    # NOTE: both --no-lazy-loader and --lazy-loader exist with the
    # same help text; which one wins is decided by downstream code.
    group.add_argument(
        '--no-lazy-loader',
        action='store_true',
        help='whether to lazy read the data set')
    group.add_argument('--half-lazy-loader', action='store_true')
    group.add_argument(
        '--loader-scatter',
        type=int,
        default=None,
        help='Number of scatters to use for dataloaders')
    group.add_argument(
        '--lazy-loader',
        action='store_true',
        help='whether to lazy read the data set')
    group.add_argument(
        '--loose-json',
        action='store_true',
        help='Use loose json (one json-formatted string per '
        'newline), instead of tight json (data file is one '
        'json string)')
    group.add_argument(
        '--presplit-sentences',
        action='store_true',
        help='Dataset content consists of documents where '
        'each document consists of newline separated sentences')
    group.add_argument(
        '--num-workers',
        type=int,
        default=2,
        help="""Number of workers to use for dataloading""")
    group.add_argument(
        '--block-size',
        type=int,
        default=10000,
        help="""Size of block to reduce memory in dataset""")
    return parser
def add_generation_api_args(parser):
    """generation api arguments

    Registers the flat defaults used by the generation HTTP API
    (folder paths, check mode, polling interval) and returns the
    parser. All flags are plain strings except --time_interval,
    whose default stays the int 10.
    """
    api_group = parser.add_argument_group('api', 'api configurations')
    for flag, default_value in (
            ('--img_folder_path', 'image/'),
            ('--input_folder_path', 'input/'),
            ('--input_rec_path', 'input/'),
            ('--check_mode', 'code'),
            ('--time_interval', 10),
    ):
        api_group.add_argument(flag, default=default_value)
    return parser
def add_tokenization_args(parser):
    """sparse attention arguments.

    Despite the legacy summary above, this registers tokenizer
    selection, sequence-length and multi-task data flags under the
    'Tokenization' argument group and returns the same parser.
    """
    group = parser.add_argument_group('Tokenization',
                                      'tokenization configurations')
    group.add_argument(
        '--tokenizer-model-type',
        type=str,
        default=None,
        help="Model type to use for sentencepiece tokenization \
        (one of ['bpe', 'char', 'unigram', 'word']) or \
        bert vocab to use for BertWordPieceTokenizer (one of \
        ['bert-large-uncased', 'bert-large-cased', etc.])")
    group.add_argument(
        '--tokenizer-path',
        type=str,
        default='tokenizer.model',
        help='path used to save/load sentencepiece tokenization '
        'models')
    group.add_argument(
        '--img-tokenizer-path',
        type=str,
        default=None,
        help='The checkpoint file path of image tokenizer.')
    group.add_argument(
        '--tokenizer-type',
        type=str,
        default=
        'ChineseSPTokenizer',  # BertWordPieceTokenizer->ChineseSPTokenizer
        choices=[
            'CharacterLevelTokenizer', 'SentencePieceTokenizer',
            'BertWordPieceTokenizer', 'GPT2BPETokenizer', 'ChineseSPTokenizer',
            'glm_ChineseSPTokenizer'
        ],
        help='what type of tokenizer to use')
    group.add_argument('--fix-command-token', action='store_true')
    group.add_argument('--not-pre-tokenize', action='store_true')
    group.add_argument(
        '--cache-dir',
        default='cache',
        type=str,  # None->'cache'
        help='Where to store pre-trained BERT downloads')
    group.add_argument(
        '--use-tfrecords',
        action='store_true',
        help='load `--train-data`, `--valid-data`, '
        '`--test-data` from BERT tf records instead of '
        'normal data pipeline')
    group.add_argument(
        '--seq-length',
        type=int,
        default=512,
        help='Maximum sequence length to process')
    group.add_argument(
        '--mem-length',
        type=int,
        default=0,
        help='The memory length to preserve')
    group.add_argument(
        '--max-preds-per-seq',
        type=int,
        default=None,
        help='Maximum number of predictions to use per sequence.'
        'Defaults to math.ceil(`--seq-length`*.15/10)*10.'
        'MUST BE SPECIFIED IF `--use-tfrecords` is True.')
    group.add_argument('--non-sentence-start', type=float, default=0.0)
    group.add_argument(
        '--sample-one-document',
        action='store_true',
        help='only sample one document in one sample')
    group.add_argument(
        '--load-splits',
        type=str,
        default=None,
        help='The path to load split indices from')
    group.add_argument(
        '--save-splits',
        type=str,
        default=None,
        help='The path to save split indices to')
    group.add_argument(
        '--save-test-data',
        type=str,
        default=None,
        help='The path to save the test data')
    # Multi-task pre-training mixing controls.
    group.add_argument(
        '--multi-task-data',
        nargs='*',
        default=None,
        help='Downsteam task names for multi-task pre-training')
    group.add_argument(
        '--multi-task-ratio',
        type=float,
        default=0.0,
        help='Ratio for multi-task pre-training')
    group.add_argument('--multi-seq-length', type=int, default=None)
    group.add_argument('--multi-batch-size', type=int, default=None)
    return parser
def add_glm_args(parser):
    """Arguments for GLM.

    Registers BlockLM/GLM pre-training objective probabilities and
    masking options under the 'GLM' argument group and returns the
    same parser.
    """
    group = parser.add_argument_group('GLM', 'GLM Configurations')
    group.add_argument(
        '--block-lm',
        action='store_true',
        help='whether use the BlockLM pre-training')
    group.add_argument(
        '--masked-lm',
        action='store_true',
        help='whether to use the mlm objective')
    # Mixing probabilities between the different span-corruption modes.
    group.add_argument('--bert-prob', type=float, default=0.5)
    group.add_argument('--gpt-infill-prob', type=float, default=0.5)
    group.add_argument('--gpt-min-ratio', type=float, default=0.5)
    group.add_argument('--gap-sentence-prob', type=float, default=0.0)
    group.add_argument('--gap-sentence-ratio', type=float, default=0.15)
    group.add_argument('--avg-block-length', type=int, default=3)
    group.add_argument('--short-seq-prob', type=float, default=0.0)
    group.add_argument('--single-span-prob', type=float, default=0.0)
    group.add_argument(
        '--task-mask',
        action='store_true',
        help='Use different mask for generation and blank filling')
    group.add_argument(
        '--no-shuffle-block',
        action='store_true',
        help='not shuffle the blocks when filling the blank')
    group.add_argument(
        '--no-block-position',
        action='store_true',
        help='Use (rough) absolute positions instead of block positions')
    group.add_argument(
        '--sentinel-token',
        action='store_true',
        help='Use sentinel (mask) tokens to replace 2d position encoding')
    group.add_argument('--block-mask-prob', type=float, default=0.0)
    group.add_argument('--context-mask-ratio', type=float, default=0.0)
    group.add_argument(
        '--random-position',
        action='store_true',
        help='Use random start position to cover all the position embeddings')
    group.add_argument(
        '--old-checkpoint',
        action='store_true',
        help='Loading the checkpoint from old libraray')
    group.add_argument(
        '--sampling-strategy',
        type=str,
        default='BaseStrategy',
        help='type name of sampling strategy')
    return parser
def add_finetune_config_args(parser):
    """Fine-tuning arguments.

    Registers task selection, cloze-evaluation, optimizer and
    continuous-prompt flags under the 'finetune' argument group and
    returns the same parser.
    """
    group = parser.add_argument_group('finetune', 'finetune configurations')
    group.add_argument('--task', type=str, help='Task name.')
    # NOTE(review): the default below hard-codes a machine-specific
    # checkpoint path (was None upstream) — confirm before reuse.
    group.add_argument(
        '--load-pretrained',
        type=str,
        help='Load pretrained model',
        default=
        '/root/yuandong_use/GR/glm_finetuned_model/blocklm-10B-kbqa_08-18-16-45'
    )
    # None->/root/yuandong_use/GR/glm_finetuned_model/blocklm-10B-kbqa_08-18-16-45
    group.add_argument(
        '--pool-token',
        type=str,
        choices=['start', 'pad', 'cls'],
        help='The token to pool the sequence representation',
        default='cls')
    group.add_argument(
        '--cloze-eval',
        action='store_true',
        help='Evaluation dataset with cloze task')
    group.add_argument(
        '--multi-token',
        action='store_true',
        help='Use multi token for cloze evaluation')
    group.add_argument(
        '--segment-length',
        type=int,
        default=0,
        help='The maximum segment length for cloze evaluation')
    group.add_argument(
        '--loss-func',
        type=str,
        choices=['cross_entropy', 'hinge', 'generative', 'mix'],
        default='cross_entropy')
    group.add_argument('--block-lm-ratio', type=float, default=0.0)
    group.add_argument(
        '--adapet',
        action='store_true',
        help='Use the decoupled cross entropy loss in AdaPET')
    group.add_argument('--pattern-id', type=int, default=0)
    group.add_argument(
        '--fast-decode',
        action='store_true',
        help=
        'Fast decode for multi-token cloze. Can only be used without checkpoint activation.'
    )
    group.add_argument('--few-superglue', action='store_true')
    group.add_argument(
        '--eval-valid',
        action='store_true',
        help='Whether evaluate on the valid set')
    group.add_argument('--validation-metric', type=str, default=None)
    group.add_argument(
        '--unidirectional',
        action='store_true',
        help='Use the left to right language model')
    group.add_argument('--src-seq-length', type=int, default=None)
    group.add_argument('--tgt-seq-length', type=int, default=None)
    # Optimizer hyper-parameters.
    group.add_argument('--adam-beta1', type=float, default=0.9)
    group.add_argument('--adam-beta2', type=float, default=0.999)
    group.add_argument('--adam-eps', type=float, default=1e-8)
    group.add_argument(
        '--optimizer', type=str, choices=['adam', 'adafactor'], default='adam')
    group.add_argument('--wsc-negative', action='store_true')
    group.add_argument('--overwrite', action='store_true')
    group.add_argument('--no-validation', action='store_true')
    # Continuous prompt arguments
    group.add_argument(
        '--continuous-prompt',
        action='store_true',
        help='Use continuous prompt for PET')
    group.add_argument('--num-prompt-tokens', type=int, default=0)
    group.add_argument(
        '--prompt-func', default='lstm', choices=['lstm', 'mlp', 'none'])
    group.add_argument(
        '--freeze-transformer', action='store_true', default=False)
    group.add_argument('--tune-prefix-layers', type=int, default=None)
    group.add_argument('--prefix-prompt', type=int, default=0)
    group.add_argument('--prompt-init', action='store_true', default=False)
    return parser
def get_args():
    """Parse all the args.

    Builds the full parser from every add_*_args helper plus the
    DeepSpeed config arguments, parses known args, then derives
    distributed-launch fields (rank, world_size, local_rank) from the
    environment and reconciles fp16/batch-size/optimizer settings with
    the DeepSpeed JSON config when one is supplied.

    Returns:
        argparse.Namespace: the fully post-processed arguments.
    """
    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
    parser = add_model_config_args(parser)
    parser = add_fp16_config_args(parser)
    parser = add_training_args(parser)
    parser = add_evaluation_args(parser)
    parser = add_data_args(parser)
    parser = add_tokenization_args(parser)
    parser = add_text_generate_args(parser)
    parser = add_generation_api_args(parser)
    parser = add_glm_args(parser)
    parser = add_finetune_config_args(parser)
    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)
    # parse_known_args: unknown flags (e.g. launcher extras) are ignored.
    args, unknown = parser.parse_known_args()
    if not args.train_data and not args.train_data_path:
        print('WARNING: No training data specified')
    args.cuda = torch.cuda.is_available()
    # Rank/world-size default to the torch.distributed launcher env vars.
    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
    if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
        mpi_define_env(args)
    elif os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
        # We are using (OpenMPI) mpirun for launching distributed data parallel processes
        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
        # Possibly running with Slurm
        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
        nodeid = int(os.getenv('SLURM_NODEID', '0'))
        args.local_rank = local_rank
        args.rank = nodeid * local_size + local_rank
        args.world_size = num_nodes * local_size
    # Model parallelism can never exceed the number of processes.
    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))
    # Dynamic loss scaling is used whenever no static scale was given.
    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True
        if args.rank == 0:
            print(' > using dynamic loss scaling')
    # The args fp32_* or fp16_* meant to be active when the
    # args fp16 is set. So the default behaviour should all
    # be false.
    if not args.fp16:
        args.fp32_embedding = False
        args.fp32_tokentypes = False
        args.fp32_layernorm = False
    if hasattr(args, 'deepspeed'
               ) and args.deepspeed and args.deepspeed_config is not None:
        # The DeepSpeed JSON config overrides the CLI for fp16,
        # micro-batch size, grad accumulation and optimizer params.
        with open(args.deepspeed_config) as file:
            deepspeed_config = json.load(file)
        if 'fp16' in deepspeed_config and deepspeed_config['fp16']['enabled']:
            args.fp16 = True
        else:
            args.fp16 = False
        if args.checkpoint_activations:
            args.deepspeed_activation_checkpointing = True
        if 'train_micro_batch_size_per_gpu' in deepspeed_config:
            args.batch_size = deepspeed_config[
                'train_micro_batch_size_per_gpu']
        if 'gradient_accumulation_steps' in deepspeed_config:
            args.gradient_accumulation_steps = deepspeed_config[
                'gradient_accumulation_steps']
        else:
            args.gradient_accumulation_steps = None
        if 'optimizer' in deepspeed_config:
            optimizer_params_config = deepspeed_config['optimizer'].get(
                'params', {})
            args.lr = optimizer_params_config.get('lr', args.lr)
            args.weight_decay = optimizer_params_config.get(
                'weight_decay', args.weight_decay)
    return args
def mpi_define_env(args):
    """Derive torch.distributed env vars from an MPI launch.

    Queries mpi4py for rank/world size, broadcasts rank 0's IP as the
    master address, and mutates both *args* (rank, local_rank,
    world_size) and os.environ (RANK, WORLD_SIZE, MASTER_ADDR,
    MASTER_PORT) so torch.distributed can initialize.
    """
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()
    master_addr = None
    if rank == 0:
        master_addr = get_hostname()
    # Every rank learns rank 0's address.
    master_addr = comm.bcast(master_addr, root=0)
    # Determine local rank by assuming hostnames are unique
    proc_name = MPI.Get_processor_name()
    all_procs = comm.allgather(proc_name)
    # Count earlier ranks on the same host to get this process's
    # zero-based local rank.
    local_rank = sum([i == proc_name for i in all_procs[:rank]])
    os.environ['RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    args.local_rank = local_rank
    args.world_size = world_size
    args.rank = rank
    os.environ['MASTER_ADDR'] = master_addr
    os.environ[
        'MASTER_PORT'] = '29500'  # TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
    print(
        'Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}'
        .format(os.environ['RANK'], args.local_rank, os.environ['WORLD_SIZE'],
                os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']))

View File

@@ -0,0 +1,115 @@
# Copyright (c) 2022 Zhipu.AI
import csv
import traceback
from io import StringIO
from urllib import parse
from flask import Response, jsonify, request, send_file
class APIException(Exception):
    """Base class for every API-level error raised by this service."""

    def __init__(self, message):
        super().__init__(message)


class IllegalParamException(APIException):
    """A request parameter is missing, malformed, or has a wrong type."""

    def __init__(self, error):
        self.error = error
        super().__init__(error)


class InputTooLongException(APIException):
    """The submitted input exceeds the allowed length."""

    def __init__(self, message, payload=None):
        self.payload = payload
        super().__init__(message)


class CanNotReturnException(APIException):
    """The requested result cannot be returned to the caller."""

    def __init__(self, message, payload=None):
        self.payload = payload
        super().__init__(message)


class MongoDBException(APIException):
    """A MongoDB operation failed."""

    def __init__(self, error):
        self.error = error
        super().__init__(error)


class MissParameterException(APIException):
    """A required parameter was not supplied."""

    def __init__(self, error):
        self.error = error
        super().__init__(error)
class HttpUtil:
    """Static helpers for building Flask responses and reading params."""

    @staticmethod
    def http_response(status=0, message='success', data=None, total=False):
        """Wrap *data* in the standard {status, message, result} envelope.

        Exceptions are stringified; a falsy result becomes []. When
        *total* is truthy and the result is a list, a 'total' field is
        added (the int itself, or len(data) for total=True).
        """
        if isinstance(data, Exception):
            data = str(data)
        envelope = {'status': status, 'message': message, 'result': data or []}
        if total and type(data) == list:
            envelope['total'] = total if type(total) == int else len(data)
        return jsonify(envelope)

    @staticmethod
    def check_param(
            name,
            request,  # noqa
            method=0,
            param_type=None,
            default=None,
            required=True):
        """Fetch and validate one request parameter.

        method 0 reads from the query string, otherwise from the JSON
        body. Missing required params and un-coercible types raise
        IllegalParamException; optional missing params yield *default*.
        """
        if method == 0:
            value = request.args.get(name)
        else:
            try:
                value = request.json.get(name)
            except Exception:  # noqa
                raise IllegalParamException('data format json')
        if value is None:
            if required:
                raise IllegalParamException('param {} is required'.format(name))
            return default
        # Exact-type match (or no expected type) passes through as-is.
        if param_type is None or type(value) == param_type:
            return value
        try:
            return param_type(value)
        except ValueError:
            raise IllegalParamException('param {}: type wrong, not {}'.format(
                name, param_type))

    @staticmethod
    def csv_file_response(data, filename):
        """Return *data* as a downloadable CSV attachment."""
        resp = Response(HttpUtil.get_csv_stream(data), mimetype='text/csv')
        resp.headers[
            'Content-Disposition'] = f'attachment; filename={parse.quote(filename)}.csv'
        return resp

    @staticmethod
    def get_csv_stream(data):
        """Serialize person dicts to CSV text with a fixed header row."""
        buffer = StringIO()
        writer = csv.writer(buffer)
        writer.writerow(['name', 'org', 'position', 'email', 'phone'])
        for person in data:
            writer.writerow([
                person['name'], person['aff'], person['position'],
                person['email'], person['phone']
            ])
        content = buffer.getvalue()
        buffer.close()
        return content

View File

@@ -0,0 +1,981 @@
# Copyright (c) 2022 Zhipu.AI
"""Sample Generate GPT2"""
import argparse
import copy
import os
import random
import time
from datetime import datetime
import deepspeed
import jsonlines
import numpy as np
import torch
import torch.nn.functional as F
from pypinyin import FINALS, FINALS_TONE, TONE3, pinyin
from .arguments import get_args
from .com_utils.http_utils import (CanNotReturnException,
InputTooLongException,
MissParameterException)
from .gpt2 import mpu
from .gpt2.configure_data import configure_data
from .gpt2.data_utils import make_tokenizer
from .gpt2.fp16 import FP16_Module
from .gpt2.model import DistributedDataParallel as DDP
from .gpt2.model import GPT2Model
from .gpt2.utils import (Timers, get_checkpoint_iteration, load_checkpoint,
print_rank_0)
# Global switch: 1 enables the classical ("old") pronunciation rules used by
# checkpz() when classifying level/oblique tones; any other value falls back
# to the modern-pinyin-only rules.
open_old_pronounce = 1
def get_model(args):
    """Build the GPT2 (transformer-XL style) model described by `args`.

    Applies fp16 conversion and, when not running under DeepSpeed, wraps the
    model in DistributedDataParallel. Under DeepSpeed + fp16 the model is
    half()-ed *before* the CUDA move to avoid an OOM at full precision.
    """
    print_rank_0('building GPT2 model ...')
    model = GPT2Model(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        max_memory_length=args.mem_length,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=True,
        relative_encoding=args.transformer_xl)
    if mpu.get_data_parallel_rank() == 0:
        # Only one rank per data-parallel group reports the parameter count.
        print(
            ' > number of parameters on model parallel rank {}: {}'.format(
                mpu.get_model_parallel_rank(),
                sum([p.nelement() for p in model.parameters()])),
            flush=True)
    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if hasattr(args, 'deepspeed') and args.deepspeed and args.fp16:
        model.half()
    # GPU allocation.
    model.cuda(torch.cuda.current_device())
    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)
    # Wrap model for distributed training.
    # NOTE(review): USE_TORCH_DDP is not defined or imported in this module's
    # visible code; this branch would raise NameError if args.deepspeed were
    # ever False (set_args() forces it to True) — confirm upstream.
    if not args.deepspeed:
        if USE_TORCH_DDP:
            i = torch.cuda.current_device()
            model = DDP(
                model,
                device_ids=[i],
                output_device=i,
                process_group=mpu.get_data_parallel_group())
        else:
            model = DDP(model)
    return model
def get_masks_and_position_ids(data,
                               eod_token,
                               reset_position_ids,
                               reset_attention_mask,
                               loss_mask=None,
                               attention_mask=None,
                               transformer_xl=False,
                               mem_length=None):
    """Derive the attention mask, loss mask and position ids for a batch.

    Args:
        data: (batch, seq) token-id tensor.
        eod_token: id of the end-of-document token; zeroed in the loss mask
            and, with the reset flags, used to restart positions/attention
            at each document boundary.
        reset_position_ids / reset_attention_mask: per-document resets
            (ignored in transformer-XL mode).
        transformer_xl: when True, build a banded mask over
            [memory | current segment] of width mem_length instead of a
            plain causal mask.

    Returns:
        (attention_mask, loss_mask, position_ids).
    """
    batch_size, seq_length = data.size()

    # Causal attention mask.
    if transformer_xl:
        # Band over the concatenated memory + segment axis: token i may see
        # positions (i - seq + mem .. i + mem] of the (seq, seq + mem) grid.
        if attention_mask is None:
            attention_mask = torch.ones(
                (1, seq_length, seq_length + mem_length), device=data.device)
        attention_mask = torch.tril(
            torch.triu(attention_mask, 1 - seq_length + mem_length),
            mem_length)
    else:
        # One mask per sample only when per-document resets are requested;
        # otherwise a single shared lower-triangular mask suffices.
        num_masks = batch_size if reset_attention_mask else 1
        if attention_mask is None:
            attention_mask = torch.ones(
                (num_masks, seq_length, seq_length), device=data.device)
        attention_mask = torch.tril(attention_mask).unsqueeze(1)

    # Loss mask: train on everything except EOD tokens.
    if loss_mask is None:
        loss_mask = torch.ones(
            data.size(), dtype=torch.float, device=data.device)
    loss_mask[data == eod_token] = 0.0

    # Position ids: 0..seq-1, broadcast over the batch.
    position_ids = torch.arange(
        seq_length, dtype=torch.long, device=data.device)
    position_ids = position_ids.unsqueeze(0).expand_as(data)
    if not transformer_xl and (reset_position_ids or reset_attention_mask):
        # Clone before in-place edits; the expand above shares storage.
        if reset_position_ids:
            position_ids = position_ids.clone()
        for b in range(batch_size):
            # Positions of EOD tokens in sample b.
            eod_positions = position_ids[b, data[b] == eod_token]
            # Detach from position_ids before position_ids itself is edited.
            if reset_position_ids:
                eod_positions = eod_positions.clone()
            prev_index = 0
            for idx in eod_positions:
                # Block attention across the document boundary.
                if reset_attention_mask:
                    attention_mask[b, 0, (idx + 1):, :(idx + 1)] = 0
                # Restart position numbering after the boundary.
                if reset_position_ids:
                    position_ids[b, (idx + 1):] -= (idx + 1 - prev_index)
                prev_index = idx + 1
    return attention_mask, loss_mask, position_ids
def set_random_seed(seed):
    """Seed every RNG used during generation (Python, NumPy, Torch and the
    model-parallel CUDA streams) for reproducibility.

    A ``None`` or non-positive seed leaves all generators untouched.
    """
    if seed is None or seed <= 0:
        return
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)
def initialize_distributed(args):
    """Initialize torch.distributed and the model/data-parallel groups.

    The rendezvous address comes from MASTER_ADDR/MASTER_PORT (defaults
    127.0.0.1:6001).
    """
    # Manually set the device ids.
    device = args.rank % torch.cuda.device_count()
    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)
    # Call the init process
    init_method = 'tcp://'
    # master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_ip = os.getenv('MASTER_ADDR', '127.0.0.1')
    master_port = os.getenv('MASTER_PORT', '6001')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend=args.distributed_backend,
        world_size=args.world_size,
        rank=args.rank,
        init_method=init_method)
    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)
    # Optional DeepSpeed Activation Checkpointing Features
    #
    # NOTE(review): set_deepspeed_activation_checkpointing is not imported in
    # this module's visible code — this call would raise NameError if
    # args.deepspeed_activation_checkpointing were enabled; confirm upstream.
    if hasattr(
            args, 'deepspeed'
    ) and args.deepspeed and args.deepspeed_activation_checkpointing:
        set_deepspeed_activation_checkpointing(args)
def setup_model(args):
    """Build the model and load checkpoint weights (no optimizer is set up;
    this path is inference-only).

    Under DeepSpeed the mp_rank_00 shard is loaded directly onto CPU and its
    'module' state dict applied; otherwise the generic load_checkpoint
    helper is used without optimizer state.
    """
    model = get_model(args)
    # if args.deepspeed:
    #     print_rank_0("DeepSpeed is enabled.")
    #
    #     model, _, _, _ = deepspeed.initialize(
    #         model=model,
    #         model_parameters=model.parameters(),
    #         args=args,
    #         mpu=mpu,
    #         dist_init_required=False
    #     )
    if args.load is not None:
        if args.deepspeed:
            iteration, release, success = get_checkpoint_iteration(args)
            print(args.load)
            path = os.path.join(args.load, 'mp_rank_00_model_states.pt')
            # map_location CPU avoids a GPU OOM spike while deserializing.
            checkpoint = torch.load(path, map_location=torch.device('cpu'))
            model.load_state_dict(checkpoint['module'])
        else:
            _ = load_checkpoint(
                model, None, None, args, load_optimizer_states=False)
    # if args.deepspeed:
    #     model = model.module
    return model
def get_batch(context_tokens, device, args):
    """Shape raw context tokens into a (batch, seq) tensor on `device` and
    build the matching attention mask and position ids.

    The loss mask produced by get_masks_and_position_ids is discarded:
    generation only needs tokens, attention mask and positions.
    """
    tokens = context_tokens.view(args.batch_size, -1).contiguous().to(device)
    # Get the masks and position ids (per-document resets disabled).
    attention_mask, _, position_ids = get_masks_and_position_ids(
        tokens,
        args.eod_token,
        reset_position_ids=False,
        reset_attention_mask=False,
        transformer_xl=args.transformer_xl,
        mem_length=args.mem_length)
    return tokens, attention_mask, position_ids
def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Filter a logits tensor in place with top-k and/or nucleus (top-p)
    sampling.

    Adapted from the Hugging Face conversational-AI example:
    https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313
    """
    if top_k > 0:
        # Push every logit below the k-th largest down to filter_value.
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value
    if top_p > 0.0:
        # Flatten to 1D; the view(logits.size(1)) requires batch size 1.
        logits = logits.view(logits.size()[1]).contiguous()
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(
            F.softmax(sorted_logits, dim=-1), dim=-1)
        # Drop tokens once the cumulative probability passes top_p...
        drop = cumulative_probs > top_p
        # ...but shift right so the token that crosses the threshold — and
        # therefore always the single most likely token — is kept.
        drop[..., 1:] = drop[..., :-1].clone()
        drop[..., 0] = 0
        logits[sorted_indices[drop]] = filter_value
        # restore the (1, vocab) shape
        logits = logits.view(1, -1).contiguous()
    return logits
# Characters treated as entering tone (ru sheng) under the classical
# pronunciation rules; checkpz() scores them as oblique.
# NOTE(review): almost every literal below renders as an empty string in this
# copy — the original CJK characters appear to have been lost; verify this
# table against the upstream source before relying on it.
rus = set([
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '湿', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', ''
])
# Toneless pinyin syllables (initial + final, no tone digit) that are treated
# as entering tone regardless of the specific character.
ss = set([
    'de', 'te', 'le', 'ze', 'ce', 'se', 'fa', 'fo', 'dei', 'zei', 'gei', 'hei',
    'sei', 'bie', 'pie', 'mie', 'die', 'tie', 'nie', 'lie', 'kuo', 'zhuo',
    'chuo', 'shuo', 'ruo'
])
def checkpz(st, wd):
    """Classify the tone of one syllable.

    Args:
        st: pypinyin TONE3 syllable for the character (e.g. 'zhong1').
        wd: the character itself.

    Returns:
        0 if `st` carries no tone digit, 1 for a level tone (ping),
        2 for an oblique tone (ze).
    """
    if not (st[-1] in ['1', '2', '3', '4']):
        return 0
    if open_old_pronounce == 1:
        # Classical-pronunciation overrides: entering-tone characters and
        # syllables count as oblique even when modern pinyin says level.
        if wd in rus:
            return 2
        # NOTE(review): the literals in this list render as empty strings —
        # original characters lost in this copy; verify upstream.
        if wd in ['', '', '', '']:
            return 1
        if st[:-1] in ss:
            return 2
        # Voiced-initial second tone derives from a historical entering tone.
        if (st[-1] == '2' and st[0] in ['b', 'd', 'g', 'j', 'z']):
            return 2
        if 'ue' in st:
            return 2
    # Modern rule: tones 1 and 2 are level, 3 and 4 oblique.
    if st[-1] in ['1', '2']:
        return 1
    return 2
# inner rhy, must obey
def checkrhyself(sentence):
    """Check a line's internal tonal pattern.

    Returns 0 for an empty line, 1 when the pattern is violated (or pinyin
    lookup failed), 2 when the (possibly partial) line is acceptable.
    """
    if len(sentence) == 0:
        return 0
    st = sentence
    fullst = False
    # Strip trailing punctuation; its presence marks a complete line.
    # NOTE(review): several list entries render as empty strings — original
    # CJK punctuation lost in this copy; verify upstream.
    while (len(st) > 0 and st[-1] in [',', '', '', '?', '', '!', '']):
        st = st[:-1]
        fullst = True
    l1 = pinyin(st, style=TONE3)
    # pypinyin returning fewer syllables than characters means non-Chinese
    # input slipped in — reject.
    if len(l1) < len(st):
        return 1
    for i in l1:
        if len(i[0]) < 2:
            return 1
    if len(st) <= 3:
        return 2
    # Tones at positions 2/4/6 must alternate level vs oblique (sum to 3).
    pz1 = checkpz(l1[1][0], sentence[1])
    if len(st) >= 4:
        pz2 = checkpz(l1[3][0], sentence[3])
        if pz2 + pz1 != 3:
            return 1
    if len(st) >= 6:
        pz3 = checkpz(l1[5][0], sentence[5])
        if pz2 + pz3 != 3:
            return 1
    if fullst:
        if len(sentence) < 6:
            return 1
        # A complete line must not end in three identical tone classes.
        pz11 = checkpz(l1[-3][0], st[-3])
        pz12 = checkpz(l1[-2][0], st[-2])
        pz13 = checkpz(l1[-1][0], st[-1])
        if (pz11 == pz12) and (pz12 == pz13):
            return 1
    return 2
def checkrhy(sentence, last, imp, req=0):
    """Score how well `sentence` tonally opposes the previous line `last`.

    Returns 0 for an empty sentence, -1000 when pinyin lookup fails, else a
    non-positive penalty (closer to 0 is better) scaled by importance `imp`;
    req=1 demands strict alternation. Repeated syllables within `sentence`
    add a further penalty.
    """
    # NOTE(review): punctuation list entries render as empty strings here —
    # original CJK punctuation lost in this copy; verify upstream.
    while (len(sentence) > 0
           and (sentence[-1] in [',', '', '', '?', '', '!', ''])):
        sentence = sentence[:-1]
    if len(sentence) == 0:
        return 0
    # NOTE(review): unlike the loop above there is no emptiness guard —
    # a `last` made entirely of punctuation would raise IndexError; confirm
    # callers always pass a line with at least one character.
    while last[-1] in [',', '', '', '?', '', '!', '']:
        last = last[:-1]
    l1 = pinyin(sentence, style=TONE3)
    l2 = pinyin(last, style=TONE3)
    disobey = 0
    if len(l1) != len(sentence):
        return -1000
    for i in range(len(sentence)):
        if (i < len(l1)) and (i < len(l2)):
            st1 = checkpz(l1[i][0], sentence[i])
            sr1 = checkpz(l2[i][0], last[i])
            # Under strict mode, even positions are expected inverted.
            if (req == 1 and i % 2 == 1):
                st1 = 3 - st1
            # Tones should oppose each other (level 1 + oblique 2 == 3).
            if st1 + sr1 != 3:
                if req == 0:
                    disobey += 0.35
                    if i % 2 == 1:
                        disobey += 0.35
                if req == 1:
                    disobey += 0.2
                    if i == len(l2) - 1:
                        disobey += 0.65
                if req == 1:
                    disobey += 0.35
    disobey *= imp
    disobey = -5 * disobey / len(l2)
    # Penalize repeated syllables (same pinyin sans tone) within the line.
    for i in range(len(l1)):
        for j in range(i + 2, len(l1)):
            if l1[i][0][:-1] == l1[j][0][:-1]:
                disobey -= 7 / len(l1)
    return disobey
def checksentence(sentence,
                  original_context,
                  min_length,
                  max_length,
                  endnote,
                  curvote=0,
                  yayun=None):
    """Validate a candidate poem line against length, punctuation, tonal and
    rhyme constraints.

    Returns 1 when the candidate must be rejected, 0 when it is a valid
    *complete* line (ends in one of `endnote`), 2 when it is a valid prefix
    that may keep growing. `yayun` is the rhyme-carrying character the line
    must rhyme with when `curvote` is 1.

    NOTE(review): many string literals in this function render as empty
    strings — original CJK punctuation lost in this copy; verify upstream.
    """
    if '<|end' in sentence:
        return 1
    if '' in sentence:
        return 1
    if len(sentence) == 0:
        return 1
    # Too long without a closing punctuation, or over the hard cap.
    if ((len(sentence) > max_length and not (sentence[-1] in endnote))
            or len(sentence) == 0) or len(sentence) > max_length + 1:
        return 1
    # Closed too early (or at the disallowed length 7).
    if (sentence[-1] in endnote) and ((len(sentence) <= min_length) or  # noqa
                                      (len(sentence) == 7)):  # noqa
        return 1
    # Verbatim repetition of an earlier line.
    if (sentence[-1] in endnote) and (sentence[:-1] in original_context):
        return 1
    mdisobey = 0  # noqa
    illegal_notes = [
        ' ', ':', '', '', '', '', '-', '——', '', '[', '', '', ']', '.',
        '', '(', '', ')', '', '·'
    ]
    if '' in endnote:
        illegal_notes.extend([',', ''])
    else:
        illegal_notes.append('')
    # Digits and ASCII letters are never allowed inside a line.
    for i in range(10):
        illegal_notes.append(str(i))
    for i in range(64, 123):
        illegal_notes.append(chr(i))
    for note in illegal_notes:
        if note in sentence:
            return 1
    last = getlastsentence(original_context)
    if min_length == max_length:
        # Fixed-length (regulated verse) lines get the tonal-opposition check.
        imp = 1
        if (',' in last) or ('' in last):
            imp = 1.5
        if curvote == 0:
            rt = checkrhy(sentence, last, imp, req=1)
        else:
            rt = checkrhy(sentence, last, imp)
        if rt < -0.75:
            return 1
    for i in range(len(sentence)):
        if min_length == max_length:
            # No character may repeat the same position of the last line.
            if (i < len(last) - 1) and (sentence[i] == last[i]):
                return 1
        if i < len(sentence) - 1:
            # No bigram may repeat the context or the line itself.
            if sentence[i:i + 2] in original_context:
                return 1
            if sentence[i:i + 2] in sentence[:i]:
                return 1
    if checkrhyself(sentence) == 1:
        return 1
    cc = curvote
    if yayun is None:
        cc = 0
    if (cc == 1 and len(sentence) >= max_length):
        # Rhyme check: the line-final syllable must share a rhyme group with
        # the final of the rhyme character `yayun`.
        final1 = pinyin(sentence, style=FINALS)
        if len(final1) < max_length:
            return 1
        final1 = final1[max_length - 1][0]
        final2 = pinyin(yayun, style=FINALS)[-1][0]
        group = [['a', 'ia', 'ua'], ['ai', 'uai', 'ei', 'ui', 'uei'],
                 ['an', 'uan', 'ian'], ['ie', 'ue', 've'], ['ou', 'iu', 'iou'],
                 ['ang', 'iang', 'uang'], ['ao', 'iao'], ['e', 'o', 'uo'],
                 ['en', 'un', 'uen', 'ong', 'iong', 'in', 'ing', 'er']]
        doc = 0
        if final1 == final2:
            doc = 1
        for i in group:
            if (final1 in i) and (final2 in i):
                doc = 1
        if doc == 0:
            return 1
    if (sentence[-1] in endnote):
        return 0
    return 2
def generate_sentence(model,
                      tokenizer,
                      args,
                      device,
                      current_tokens,
                      mems,
                      endnote=[',', '', '?', ''],
                      num_candidates=1,
                      min_length=5,
                      max_length=7,
                      yayun=None):
    """Sample candidate poem lines with a Monte-Carlo-tree-style search over
    the model's next-token distribution.

    Each tree node is [logits, mems, tokens, child-index table, visit counts,
    accumulated score]. Candidates rejected by checksentence() back-track;
    accepted complete lines are collected until `num_candidates` are found
    or the retry budget runs out.

    Returns a dict with 'output_tokens_length' (token count of the last
    sampled sequence) and 'result': a list of
    [sentence, score, tokens, mems, yayun] entries (possibly empty).

    NOTE(review): the mutable default `endnote=[...]` is shared across
    calls; it is never mutated here, so this is benign but fragile. Several
    list literals render as empty strings — original CJK punctuation lost in
    this copy; verify upstream.
    """
    model.eval()
    with torch.no_grad():
        mct_tree = []
        if mems == []:
            # Fresh context: run the full prompt through the model once.
            mems = []
            tokens, attention_mask, position_ids = get_batch(
                current_tokens, device, args)
            logits, *rts = model(tokens, position_ids, attention_mask, *mems)
        else:
            # Continue from cached transformer-XL memories: feed only the
            # last token with its absolute position.
            tokens = current_tokens
            index = len(tokens[0])
            logits, *rts = model(
                tokens[:, index - 1:index],
                tokens.new_ones((1, 1)) * (index - 1),
                tokens.new_ones(
                    1,
                    1,
                    1,
                    args.mem_length + 1,
                    device=tokens.device,
                    dtype=torch.float), *mems)
        output_tokens_list = tokens.view(-1).contiguous()
        original_context = tokenizer.DecodeIds(output_tokens_list.tolist())
        context_length = len(tokens[0])
        logits = logits[0, -1]
        # Root node: child table initialised to -1 (unexpanded).
        mct_tree.append([
            logits, rts, tokens, -np.ones(len(logits)),
            torch.ones(len(logits)).cuda(), 0
        ])
        final_result = []
        nextid = 0
        tries = 0
        max_tries = num_candidates * 30
        curvote = 1
        if ',' in endnote:
            curvote = 0
        # endid: vocabulary id of the line-closing punctuation.
        # NOTE(review): 43359/43361 are hard-coded for the ChineseSPTokenizer
        # vocabulary — confirm against the tokenizer if it ever changes.
        if ',' in endnote:
            endid = 43359
        else:
            endid = 43361
        dpcount = 0
        tmp = args.temperature
        while ((len(final_result) < num_candidates) and (tries < max_tries)
               and (tries < 1000)):
            currentid = nextid
            tries += 1
            # Descend the tree: repeatedly sample a child, penalising
            # frequently-visited tokens, until an unexpanded slot is hit.
            while currentid != -1:
                tc = torch.log(mct_tree[currentid][4])
                tc = tc + F.relu(tc - 10) * 1000
                logits = mct_tree[currentid][0].view(-1) - tc * 0.5
                logits = logits[:50001]
                log_probs = F.softmax(logits, dim=-1)
                pr = torch.multinomial(log_probs, num_samples=1)[0]
                prev = pr.item()
                mct_tree[currentid][4][prev] += 1
                lastid = currentid
                currentid = int(mct_tree[currentid][3][prev])
            # start from lastid & currentid
            cqs = mct_tree[lastid][2]
            tokens = torch.cat((cqs, pr.unsqueeze(0).view(1, 1)), dim=1)
            output_tokens_list = tokens.view(-1).contiguous()
            sentence = tokenizer.DecodeIds(
                output_tokens_list[context_length:].tolist())
            logit = mct_tree[lastid][0]
            log_probs = F.softmax(logit, dim=-1)
            log_pbs = torch.log(log_probs)
            score = log_pbs[prev].item()
            nextid = 0
            ip = checksentence(
                sentence,
                original_context,
                min_length,
                max_length,
                endnote,
                curvote=curvote,
                yayun=yayun)
            # Reject duplicates of already-collected candidates.
            for j in final_result:
                if j[0] == sentence:
                    ip = 1
                if ('<|end' in sentence) and ('<|end' in j[0]):
                    ip = 1
            score = mct_tree[lastid][5] + score
            if (ip == 1):
                # Invalid: retry from the same parent; after repeated dead
                # ends, blacklist the token and restart from the root.
                nextid = lastid
                dpcount += 1
                max_tries += 1
                if (dpcount >= 50) or (dpcount >= 8
                                       and len(sentence) < max_length):
                    nextid = 0
                    dpcount = 0
                    mct_tree[lastid][4][prev] = 100000
                continue
            dpcount = 0
            if (ip == 0):
                # Valid complete line: record it and block re-sampling it.
                mct_tree[lastid][4][prev] = 100000
                yay = yayun
                if curvote == 1:
                    yay = sentence[-2]
                final_result.append([
                    copy.deepcopy(sentence),
                    copy.deepcopy(score),
                    copy.deepcopy(tokens),
                    copy.deepcopy(mct_tree[lastid][1]), yay
                ])
                continue
            # Valid prefix (ip == 2): expand a new tree node.
            mct_tree[lastid][3][prev] = len(mct_tree)
            tmp = args.temperature
            # Cool the temperature near the end of the line.
            if (len(sentence) >= 4
                    or (len(sentence) == 3 and max_length == 5)):
                tmp = tmp * 0.6
            rts = mct_tree[lastid][1]
            index = len(tokens[0])
            logits, *rts = model(
                tokens[:, index - 1:index],
                tokens.new_ones((1, 1)) * (index - 1),
                tokens.new_ones(
                    1,
                    1,
                    1,
                    args.mem_length + 1,
                    device=tokens.device,
                    dtype=torch.float), *rts)
            logits = logits[0, -1] / tmp
            # At the length cap, strongly bias toward closing punctuation.
            if len(sentence) == max_length:
                logits[endid] += 10
            mct_tree.append([
                logits, rts, tokens, -np.ones(len(logits)),
                torch.ones(len(logits)).cuda(), score
            ])
            nextid = len(mct_tree) - 1
        del mct_tree
        torch.cuda.empty_cache()
        res = {}
        res['output_tokens_length'] = len(output_tokens_list)
        res['result'] = final_result
        return res
def getlength(str):
    """Return the length of the second-to-last punctuation-separated segment
    of `str` (i.e. the last complete line of the poem so far).

    NOTE(review): the parameter shadows the builtin `str`, and several
    `.replace('', ',')` targets render as empty strings — replacing '' with
    ',' would insert commas between every character, so the original CJK
    punctuation was clearly lost in this copy; verify upstream.
    """
    w = str.replace('', ',').replace('', ',').replace('', ',').replace(
        '?', ',').replace(' ', ',').replace('',
                                            ',').replace('!', ',').replace(
                                                ':', ',').replace(' ', '')
    sp = w.split(',')
    return len(sp[-2])
def getlastsentence(str):
    """Return the last (possibly trailing, else second-to-last) line of
    `str` with the original final punctuation character re-appended.

    NOTE(review): same caveats as getlength — the parameter shadows the
    builtin `str` and the `.replace('', ',')` targets appear to be CJK
    punctuation lost in this copy; verify upstream.
    """
    w = str.replace('', ',').replace('', ',').replace('', ',').replace(
        '?', ',').replace(' ', ',').replace('',
                                            ',').replace('!', ',').replace(
                                                ':', ',').replace(' ', '')
    sp = w.split(',')
    fom = sp[-1]
    if len(fom) == 0:
        fom = sp[-2]
    return fom + str[-1]
def generate_string(model,
                    tokenizer,
                    args,
                    device,
                    title,
                    author,
                    desc=None,
                    length=None,
                    st=None,
                    lycr=5,
                    senlength=4):
    """Generate a complete classical poem line by line.

    Args:
        title/author/desc: prompt fields embedded into the model input.
        lycr: characters per line (5 or 7).
        senlength: number of lines (4 for jueju, otherwise lüshi).

    Returns a dict with 'prompt_token_num', 'completion_token_num' and
    'text' (prompt + poem) on success.

    Raises:
        InputTooLongException: prompt exceeds args.seq_length.
        CanNotReturnException: sentence search failed to produce a line.
    """
    lycr_str = ''
    senlength_str = ''
    if lycr == 5:
        lycr_str = '诗体:五言'
    else:
        lycr_str = '诗体:七言'
    if senlength == 4:
        senlength_str = '格律:绝句'
    else:
        senlength_str = '格律:律诗'
    input_str = title + ' 作者:' + author + ' 体裁:诗歌' + lycr_str + senlength_str + '题名:' + title + ' 正文: '  # noqa
    if desc is not None:
        input_str = title + ' 作者:' + author + ' 体裁:诗歌' + lycr_str + senlength_str + '描述:' + desc + ' 题名:' + title + ' 正文: '  # noqa
    input_len = len(input_str)  # noqa
    context_count = 0  # noqa
    model.eval()
    with torch.no_grad():
        context_tokens = tokenizer.EncodeAsIds(input_str).tokenization
        eo_tokens = tokenizer.EncodeAsIds('<|endoftext|>').tokenization
        context_length = len(context_tokens)
        if context_length >= args.seq_length:
            res = {}
            res['prompt_token_num'] = 0
            res['completion_token_num'] = 0
            res['text'] = ''
            res['errmsg'] = 'the text you entered is too long, please reduce the number of characters'
            raise InputTooLongException(
                'the text you entered is too long, please reduce the number of characters',
                res)
        context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
        eo_token_tensor = torch.cuda.LongTensor(eo_tokens)  # noqa
        context_length_tensor = torch.cuda.LongTensor([context_length])
        context_length = context_length_tensor[0].item()
        start_time = time.time()  # noqa
        counter, mems = 0, []  # noqa
        org_context_length = context_length  # noqa
        completion_token_length = context_length
        # Beam width is fixed to 1: pure greedy over candidate sentences.
        beam_size = 1
        beam_candidate = 1
        beam_max = 1  # noqa
        max_headings = 4  # noqa
        final_storage = []  # noqa
        final_storage_score = []  # noqa
        step = senlength + 1
        if st is None:
            st = 8
        overall_score = []
        past_beam_id = []
        # NOTE(review): both branches below are identical — `length` has no
        # effect on the first line; likely vestigial. Confirm upstream.
        if length is not None:
            res = generate_sentence(
                model,
                tokenizer,
                args,
                device,
                context_tokens_tensor, [],
                min_length=lycr - 1,
                max_length=lycr,
                num_candidates=beam_size)
            beam_sentences = res.get('result', [])
            completion_token_length = res.get('output_tokens_length', 0)
        else:
            res = generate_sentence(
                model,
                tokenizer,
                args,
                device,
                context_tokens_tensor, [],
                min_length=lycr - 1,
                max_length=lycr,
                num_candidates=beam_size)
            beam_sentences = res.get('result', [])
            completion_token_length = res.get('output_tokens_length', 0)
        if len(beam_sentences) == 0:
            res = {}
            res['prompt_token_num'] = context_length
            res['completion_token_num'] = 0
            res['text'] = ''
            res['errmsg'] = '太难了,写不出来。'
            raise CanNotReturnException('太难了,写不出来。', res)
        for i in range(step):
            beam_new_sentences = []
            # Alternate line-ending punctuation between even and odd lines.
            # NOTE(review): several entries render as empty strings — CJK
            # punctuation lost in this copy; verify upstream.
            endnote = [',', '', '?', '']
            if i % 2 == 0:
                endnote = ['', '?', '', '', '!']
            overall_score = []  # noqa
            past_beam_id = []  # noqa
            id = 0
            current_sentence = input_str + beam_sentences[0][0]
            ini_score = beam_sentences[id][1]  # noqa
            token_tensor = beam_sentences[id][2]
            mems = beam_sentences[id][3]
            len_sentence = getlength(beam_sentences[id][0])  # noqa
            res = generate_sentence(
                model,
                tokenizer,
                args,
                device,
                token_tensor,
                mems,
                num_candidates=beam_candidate,
                endnote=endnote,
                min_length=lycr - 1,
                max_length=lycr,
                yayun=beam_sentences[id][-1])
            gen = res.get('result', [])
            completion_token_length = res.get('output_tokens_length', 0)
            if len(gen) == 0:
                res = {}
                res['prompt_token_num'] = context_length
                res['completion_token_num'] = context_length
                res['text'] = ''
                res['errmsg'] = '太难了,写不出来。'
                raise CanNotReturnException('太难了,写不出来。', res)
            jj = gen[0]
            if ('<|end' in jj[0] or i == senlength - 1):
                # The model closed the poem (or we reached the last line).
                if (i % 2 == 1 and i > -3):
                    # Complete at an even number of lines: return the poem.
                    del beam_sentences
                    del beam_new_sentences
                    torch.cuda.empty_cache()
                    res = {}
                    res['prompt_token_num'] = context_length
                    res['completion_token_num'] = completion_token_length
                    res['text'] = current_sentence
                    return res
                else:
                    # Odd-line ending is not allowed: resample the line.
                    res = generate_sentence(
                        model,
                        tokenizer,
                        args,
                        device,
                        token_tensor,
                        mems,
                        num_candidates=beam_candidate,
                        endnote=endnote,
                        min_length=lycr - 1,
                        max_length=lycr,
                        yayun=beam_sentences[id][-1])
                    gen = res.get('result', [])
                    completion_token_length = res.get('output_tokens_length',
                                                      0)
                    if len(gen) == 0:
                        res = {}
                        res['prompt_token_num'] = context_length
                        res['completion_token_num'] = 0
                        res['text'] = ''
                        res['errmsg'] = '太难了,写不出来。'
                        raise CanNotReturnException('太难了,写不出来。', res)
            st = jj[0]
            # experiment shows that this is better universal,
            jj[0] = beam_sentences[id][0] + jj[0]
            jj[1] = 0
            beam_new_sentences.append(jj)
            del beam_sentences
            torch.cuda.empty_cache()
            beam_sentences = beam_new_sentences
        # parallel ends
        # Fell out of the loop without returning: give up.
        del beam_sentences
        del beam_new_sentences
        torch.cuda.empty_cache()
        res = {}
        res['prompt_token_num'] = context_length
        res['completion_token_num'] = 0
        res['text'] = ''
        res['errmsg'] = '太难了,写不出来。'
        raise CanNotReturnException('太难了,写不出来。', res)
def prepare_tokenizer(args):
    """Build the tokenizer described by `args` and pad the vocabulary size
    up to a multiple of make_vocab_size_divisible_by * model-parallel world
    size (required for even embedding sharding).

    Side effects: sets args.tokenizer_num_tokens,
    args.tokenizer_num_type_tokens, args.eod_token and args.vocab_size.
    """
    tokenizer_args = {
        'tokenizer_type': args.tokenizer_type,
        'corpus': None,
        'model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir
    }
    tokenizer = make_tokenizer(**tokenizer_args)
    num_tokens = tokenizer.num_tokens
    before = num_tokens
    after = before
    multiple = args.make_vocab_size_divisible_by * \
        mpu.get_model_parallel_world_size()  # noqa
    # Round `after` up to the next multiple.
    while (after % multiple) != 0:
        after += 1
    print_rank_0('> padded vocab (size: {}) with {} dummy '
                 'tokens (new size: {})'.format(before, after - before, after))
    args.tokenizer_num_tokens = after
    args.tokenizer_num_type_tokens = tokenizer.num_type_tokens
    args.eod_token = tokenizer.get_command('eos').Id
    args.vocab_size = after
    print('prepare tokenizer done', flush=True)
    return tokenizer
def set_args():
    """Parse command-line args, then override them with the fixed
    configuration of the 2.9B-parameter transformer-XL poem model
    (32 layers, hidden 2560, 32 heads, fp16, DeepSpeed inference).
    """
    args = get_args()
    args.deepspeed = True
    args.num_nodes = 1
    args.num_gpus = 1
    args.model_parallel_size = 1
    args.num_layers = 32
    args.hidden_size = 2560
    # Default checkpoint dir; prepare_model() overwrites this with model_dir.
    args.load = 'modelscope-txl/'
    args.num_attention_heads = 32
    args.max_position_embeddings = 1024
    args.tokenizer_type = 'ChineseSPTokenizer'
    args.cache_dir = 'cache'
    args.fp16 = True
    args.out_seq_length = 180
    args.seq_length = 200
    args.mem_length = 256
    args.transformer_xl = True
    # Sampling hyper-parameters (top_k/top_p disabled; the MCT search in
    # generate_sentence does its own filtering).
    args.temperature = 1.2
    args.top_k = 0
    args.top_p = 0
    return args
def prepare_model(model_dir):
    """Load the TXL poem model from `model_dir` for inference.

    Initializes torch.distributed (single process), builds the tokenizer
    and model, and returns (model, tokenizer, args) ready for fast_poem().
    """
    # Disable CuDNN.
    torch.backends.cudnn.enabled = False
    # Timer.
    timers = Timers()  # noqa
    # Arguments.
    args = set_args()
    args.load = model_dir
    # Total XL memory must cover the full sequence plus the rolling cache.
    args.mem_length = args.seq_length + args.mem_length - 1
    # Pytorch distributed.
    initialize_distributed(args)
    # Random seeds for reproducability.
    args.seed = random.randint(0, 1000000)
    set_random_seed(args.seed)
    # get the tokenizer
    args.tokenizer_path = model_dir
    tokenizer = prepare_tokenizer(args)
    # Model, optimizer, and learning rate.
    model = setup_model(args)
    # setting default batch size to 1
    args.batch_size = 1
    # generate samples
    return model, tokenizer, args
def fast_poem(content, model, tokenizer, args):
    """Generate a poem from a request payload.

    `content` must provide 'title', 'author', 'desc', 'lycr' (characters
    per line) and 'senlength' (number of lines); the heavy lifting is
    delegated to generate_string() on the current CUDA device.
    """
    device = torch.cuda.current_device()
    return generate_string(
        model,
        tokenizer,
        args,
        device,
        content['title'],
        content['author'],
        desc=content['desc'],
        lycr=content['lycr'],
        senlength=content['senlength'])

View File

@@ -0,0 +1,263 @@
# Modified by Zhipu.AI
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""parses arguments and preps data loader"""
import copy
import torch
from . import data_utils, mpu
class DataConfig:
    """Holds default data-pipeline settings and applies them to an argument
    namespace before building the train/val/test loaders."""

    def __init__(self, defaults=None):
        """Store *defaults*, creating a fresh dict when omitted.

        Bug fix: the original signature used a mutable default argument
        (``defaults={}``), so every DataConfig built without an explicit
        dict shared — and, via set_defaults(), mutated — one global object.
        """
        super(DataConfig, self).__init__()
        self.defaults = {} if defaults is None else defaults

    def apply(self, args):
        """Fill missing attributes on *args*, then build the data loaders."""
        if torch.distributed.get_rank() == 0:
            print('configuring data')
        self.apply_defaults(args)
        return make_loaders(args)

    def set_defaults(self, **kwargs):
        """Override or add default values."""
        for k, v in kwargs.items():
            self.defaults[k] = v

    def apply_defaults(self, args):
        """Copy each default onto *args* unless the attribute already exists.

        Dashes in keys are normalised to underscores to match argparse
        attribute naming.
        """
        for k, v in self.defaults.items():
            k = k.replace('-', '_')
            if not hasattr(args, k):
                setattr(args, k, v)
def make_data_loader(dataset, batch_size, args):
    """Wrap a dataset in a DataLoader with the sampler appropriate for the
    current data-parallel configuration.

    transformer-XL training needs sequentially ordered, rank-partitioned
    batches (memory carries across steps); otherwise a random or sequential
    sampler is used, distributed-batch-wrapped when world_size > 1.
    """
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    if args.transformer_xl:
        batch_sampler = data_utils.samplers.DistributedSequentialSampler(
            len(dataset), args.train_iters, batch_size, rank, world_size)
    else:
        shuffle = args.shuffle
        if shuffle:
            # Sample with replacement so exactly train_iters batches exist.
            sampler = data_utils.samplers.RandomSampler(
                dataset,
                replacement=True,
                num_samples=batch_size * args.train_iters)
        else:
            sampler = torch.utils.data.SequentialSampler(dataset)
        # Drop the ragged final batch only in the distributed case, where
        # all ranks must see the same number of batches.
        drop_last = distributed
        # the GPUs in the same model parallel group receive the same data
        if distributed:
            batch_sampler = data_utils.samplers.DistributedBatchSampler(
                sampler,
                batch_size,
                drop_last,
                rank,
                world_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps)
        else:
            batch_sampler = torch.utils.data.BatchSampler(
                sampler, batch_size, drop_last)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        num_workers=args.num_workers,
        pin_memory=True)
    return data_loader
def make_tfrecord_loaders(args):
    """Load train/val/test datasets from shuffled TFRecords.

    Returns ((train, valid, test), tokenizer); valid/test are None when the
    corresponding data paths are not configured.
    """
    # Bug fix: the original used `import data_utils.tf_dl`, an *absolute*
    # import that bypasses this package's relative `data_utils` module
    # (imported at the top as `from . import data_utils`) and fails unless a
    # top-level `data_utils` package happens to be installed. Import the
    # submodule relative to this package instead.
    from .data_utils import tf_dl
    data_set_args = {
        'batch_size': args.batch_size,
        'max_seq_len': args.seq_length,
        'max_preds_per_seq': args.max_preds_per_seq,
        'train': True,
        # The threaded loader requires at least one worker.
        'num_workers': max(args.num_workers, 1),
        # Per-rank seed so each worker shuffles its shards differently.
        'seed': args.seed + args.rank + 1,
        'threaded_dl': args.num_workers > 0
    }
    train = tf_dl.TFRecordDataLoader(args.train_data, **data_set_args)
    # Eval loaders reuse the same config with train=False and the optional
    # eval-specific sequence limits.
    data_set_args['train'] = False
    if args.eval_seq_length is not None:
        data_set_args['max_seq_len'] = args.eval_seq_length
    if args.eval_max_preds_per_seq is not None:
        data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    valid = None
    if args.valid_data is not None:
        valid = tf_dl.TFRecordDataLoader(args.valid_data, **data_set_args)
    test = None
    if args.test_data is not None:
        test = tf_dl.TFRecordDataLoader(args.test_data, **data_set_args)
    tokenizer = data_utils.make_tokenizer(
        args.tokenizer_type,
        train,
        args.tokenizer_path,
        args.vocab_size,
        args.tokenizer_model_type,
        cache_dir=args.cache_dir)
    return (train, valid, test), tokenizer
def make_loaders(args):
    """Build the (train, valid, test) data loaders and the tokenizer.

    Missing eval splits are derived either from the train split proportions
    (get_split) or from explicit valid/test data paths. Also sets
    args.do_train / args.do_valid / args.do_test.

    NOTE(review): `tokenizer` is only bound inside the dataset-building
    branches — if train_data, valid_data and test_data are all None the
    final return raises UnboundLocalError; confirm callers always configure
    at least one.
    """
    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    # Global batch sizes are per-rank batch size scaled by the world size.
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    # Negative lengths are interpreted as per-rank and scaled up.
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'local_rank': args.local_rank,
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'lazy': args.lazy_loader,
        'xl_style': args.transformer_xl,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'non_binary_cols': None,
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'tokenizer_type': args.tokenizer_type,
        'tokenizer_model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'pre_tokenize': not args.not_pre_tokenize
    }
    eval_set_args = copy.copy(data_set_args)
    # Eval datasets are never themselves split.
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key
    # make datasets splits and tokenizer
    train = None
    valid = None
    test = None
    if args.train_data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer
    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid, tokenizer = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test, tokenizer = data_utils.make_dataset(**eval_set_args)
    # wrap datasets with data loader
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, batch_size, args)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, eval_batch_size, args)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, eval_batch_size, args)
        args.do_test = True
    else:
        args.do_test = False
    return (train, valid, test), tokenizer
def get_split(args):
    """
    Parse `args.split` — comma- or slash-separated proportions — into a
    normalised [train, valid, test] triple summing to 1. An explicitly
    configured valid/test dataset zeroes the corresponding proportion.
    """
    raw = args.split
    if ',' in raw:
        splits = [float(s) for s in raw.split(',')]
    elif '/' in raw:
        splits = [float(s) for s in raw.split('/')]
    else:
        splits = [float(raw)]
    # If the listed proportions cover less than the whole, the remainder
    # becomes an extra split.
    remainder = 1 - sum(splits)
    if remainder > 0.:
        splits.append(remainder)
    # Pad/trim to exactly three entries: train, valid, test.
    splits = (splits + [0., 0., 0.])[:3]
    # Explicit eval datasets override split proportions.
    if args.valid_data is not None:
        splits[1] = 0.
    if args.test_data is not None:
        splits[2] = 0.
    norm = sum(splits)
    return [s / norm for s in splits]
def configure_data():
    """add cmdline flags for configuring datasets"""
    # These defaults are consumed by data_utils but are either deprecated or
    # not meant to be exposed on the command line; specific scripts override
    # them in code via DataConfig.set_defaults().
    return DataConfig(
        defaults={
            'world_size': 1,
            'rank': -1,
            'persist_state': 0,
            'lazy': False,
            'transpose': False,
            'data_set_type': 'supervised',
            'seq_length': 256,
            'eval_seq_length': 256,
            'samples_per_shard': 100,
        })

View File

@@ -0,0 +1,250 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for creating datasets"""
import math
import os
import time
from . import corpora
from .datasets import (ConcatDataset, GPT2Dataset, ShuffleDataset,
SplitDataset, XLDataset, bert_sentencepair_dataset,
csv_dataset, json_dataset, split_ds)
from .lazy_loader import LazyLoader, LazyWriter, exists_lazy
from .samplers import DistributedBatchSampler
from .tokenization import (BertWordPieceTokenizer, CharacterLevelTokenizer,
CommandToken, GPT2BPETokenizer, Tokenization,
Tokenizer, make_tokenizer)
# Indices of the (train, val, test) positions in dataset triples.
TRAIN_DATA = 0
VAL_DATA = 1
TEST_DATA = 2
def should_split(split):
    """
    given split proportions checks if should split
    Examples:
    >>> should_split([10,0,0])
    False
    >>> should_split([1,.1,.2])
    True
    """
    # Splitting is needed unless one proportion carries the entire mass.
    largest_share = max(split) / sum(split)
    return largest_share != 1.
def get_ext(path):
    """gets path extension (including the leading dot; '' if none)"""
    _, extension = os.path.splitext(path)
    return extension
def get_dataset(name, tokenizer, pre_tokenize, local_rank):
    """Build a named corpus dataset, creating its lazy-loading cache on demand.

    Args:
        name: key into ``corpora.NAMED_CORPORA``.
        tokenizer: tokenizer used to pre-tokenize the cache (or tokenize lazily).
        pre_tokenize: if True, cache token-id arrays; otherwise cache raw text.
        local_rank: distributed local rank. Only rank 0 writes the cache;
            other ranks poll the filesystem until the length index appears.

    Returns:
        A ``corpora.PromptDataset`` or ``corpora.KeyDataset`` backed by
        memory-mapped lazy loaders.

    Raises:
        NotImplementedError: if `name` is not a registered corpus.
    """
    if supported_corpus(name):
        dataset = corpora.NAMED_CORPORA[name]
        path = dataset.PATH
        if issubclass(dataset, corpora.PromptReader):
            if not (exists_lazy(path, data_type='prompt')
                    and exists_lazy(path, data_type='text')):
                # create cached version of dataset for lazy loading if it doesn't exist
                if local_rank == 0:
                    prompt_writer = LazyWriter(
                        path, data_type='prompt', is_array=pre_tokenize)
                    text_writer = LazyWriter(
                        path, data_type='text', is_array=pre_tokenize)
                    writers = {'prompt': prompt_writer, 'text': text_writer}
                    # Instantiating the reader streams the corpus through the
                    # writers as a side effect of __init__.
                    dataset(
                        writers=writers,
                        tokenizer=tokenizer,
                        tokenize=pre_tokenize)
                    prompt_writer.close()
                    text_writer.close()
                else:
                    # Non-zero ranks wait for rank 0 to finish writing; the
                    # .len.pkl file is written on close, so its existence
                    # signals completion.
                    while not os.path.exists(
                            LazyWriter.get_len_path(path, data_type='prompt')):
                        time.sleep(1)
            # When pre-tokenized, samples come back as numpy arrays; convert
            # to plain lists for downstream consumers.
            map_fn = (lambda x: x.tolist()) if pre_tokenize else None
            prompts = LazyLoader(
                path,
                data_type='prompt',
                map_fn=map_fn,
                mem_map=True,
                is_array=pre_tokenize)
            texts = LazyLoader(
                path,
                data_type='text',
                map_fn=map_fn,
                mem_map=True,
                is_array=pre_tokenize)
            text = corpora.PromptDataset(
                prompt_loader=prompts,
                text_loader=texts,
                tokenizer=tokenizer,
                to_tokenize=not pre_tokenize)
            return text
        elif issubclass(dataset, corpora.KeyReader):
            if not (exists_lazy(path, data_type='text')
                    and exists_lazy(path, data_type='mask')):
                # create cached version of dataset for lazy loading if it doesn't exist
                if local_rank == 0:
                    text_writer = LazyWriter(
                        path, data_type='text', is_array=pre_tokenize)
                    # masks are run-length lists, always stored as arrays
                    mask_writer = LazyWriter(
                        path, data_type='mask', is_array=True)
                    writers = {'mask': mask_writer, 'text': text_writer}
                    dataset(
                        writers=writers,
                        tokenizer=tokenizer,
                        tokenize=pre_tokenize)
                    mask_writer.close()
                    text_writer.close()
                else:
                    while not os.path.exists(
                            LazyWriter.get_len_path(path, data_type='mask')):
                        time.sleep(1)
            map_fn = (lambda x: x.tolist()) if pre_tokenize else None
            masks = LazyLoader(
                path,
                data_type='mask',
                map_fn=map_fn,
                mem_map=True,
                is_array=True)
            texts = LazyLoader(
                path,
                data_type='text',
                map_fn=map_fn,
                mem_map=True,
                is_array=pre_tokenize)
            text = corpora.KeyDataset(
                mask_loader=masks,
                text_loader=texts,
                tokenizer=tokenizer,
                to_tokenize=not pre_tokenize)
            return text
    else:
        raise NotImplementedError('dataset %s is not supported' % name)
def supported_corpus(corpus_name):
    """Return True if *corpus_name* is registered in `corpora.NAMED_CORPORA`."""
    known = corpora.NAMED_CORPORA
    return corpus_name in known
def make_dataset(path,
                 seq_length,
                 mem_length,
                 local_rank,
                 lazy=False,
                 xl_style=False,
                 shuffle=True,
                 split=None,
                 tokenizer=None,
                 tokenizer_type='CharacterLevelTokenizer',
                 tokenizer_model_path=None,
                 vocab_size=None,
                 model_type='bpe',
                 pad_token=0,
                 character_converage=1.0,
                 non_binary_cols=None,
                 sample_one_document=False,
                 pre_tokenize=False,
                 **kwargs):
    """Create dataset(s) + tokenizer for common options.

    `path` may be a single corpus name or a list (which gets concatenated).
    `kwargs['ds_type']` selects the wrapper ('bert' or 'gpt2'); with 'gpt2',
    `xl_style` chooses XLDataset (with memory) over GPT2Dataset.
    Returns a (dataset-or-list-of-splits, tokenizer) tuple.

    NOTE(review): `character_converage` is a typo for 'coverage' but is part
    of the public interface; `lazy` is currently unused here — confirm with
    callers before removing.
    """
    if split is None:
        split = [1.]
    if non_binary_cols is not None:
        # multilabel dataset support (only for csvs)
        label_key = non_binary_cols  # noqa
    # make tokenizer for dataset
    if tokenizer is None:
        tokenizer = make_tokenizer(tokenizer_type, None, tokenizer_model_path,
                                   vocab_size, model_type, pad_token,
                                   character_converage, **kwargs)
    # get one or multiple datasets and concatenate
    if isinstance(path, str):
        ds = get_dataset(
            path,
            tokenizer=tokenizer,
            pre_tokenize=pre_tokenize,
            local_rank=local_rank)
    else:
        ds = [
            get_dataset(
                p,
                tokenizer=tokenizer,
                pre_tokenize=pre_tokenize,
                local_rank=local_rank) for p in path
        ]
        ds = ConcatDataset(ds)
    ds_type = ''
    if 'ds_type' in kwargs:
        ds_type = kwargs['ds_type']
    # Split dataset into train/val/test (and wrap bert dataset)
    if should_split(split):
        # split_ds may return None entries for zero-proportion splits,
        # hence the `if d is not None` guards below.
        ds = split_ds(ds, split, shuffle=shuffle)
        if ds_type.lower() == 'bert':
            presplit_sentences = kwargs[
                'presplit_sentences'] if 'presplit_sentences' in kwargs else False
            ds = [
                bert_sentencepair_dataset(
                    d,
                    max_seq_len=seq_length,
                    presplit_sentences=presplit_sentences)
                if d is not None else None for d in ds
            ]
        elif ds_type.lower() == 'gpt2':
            if xl_style:
                ds = [
                    XLDataset(
                        d,
                        tokenizer,
                        max_seq_len=seq_length,
                        mem_len=mem_length,
                        sample_across_doc=not sample_one_document)
                    if d is not None else None for d in ds
                ]
            else:
                ds = [
                    GPT2Dataset(
                        d,
                        tokenizer,
                        max_seq_len=seq_length,
                        sample_across_doc=not sample_one_document)
                    if d is not None else None for d in ds
                ]
    else:
        if ds_type.lower() == 'bert':
            presplit_sentences = kwargs[
                'presplit_sentences'] if 'presplit_sentences' in kwargs else False
            ds = bert_sentencepair_dataset(
                ds,
                max_seq_len=seq_length,
                presplit_sentences=presplit_sentences)
        elif ds_type.lower() == 'gpt2':
            if xl_style:
                ds = XLDataset(
                    ds,
                    tokenizer,
                    max_seq_len=seq_length,
                    mem_len=mem_length,
                    sample_across_doc=not sample_one_document)
            else:
                ds = GPT2Dataset(
                    ds,
                    tokenizer,
                    max_seq_len=seq_length,
                    sample_across_doc=not sample_one_document)
    return ds, tokenizer

View File

@@ -0,0 +1,366 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""several datasets with preset arguments"""
import os
import random
from multiprocessing import Process, Queue
import json
import tqdm
from torch.utils import data
from .datasets import csv_dataset, json_dataset
from .lazy_loader import LazyLoader
# Number of worker processes DataReader spawns to tokenize corpora in parallel.
NUM_PROCESSES = 40
class webtext(json_dataset):
    """
    dataset for webtext with arguments configured for convenience
    command line usage: `--train-data webtext`
    """
    # Hard-coded location of the loose-JSON webtext dump.
    PATH = 'data/webtext/data.json'
    assert_str = 'make sure to set PATH for webtext data_utils/corpora.py'
    def __init__(self, **kwargs):
        # Fail fast with a pointer to this file if the data is missing.
        assert os.path.exists(webtext.PATH), \
            webtext.assert_str
        if not kwargs:
            kwargs = {}
        # Each line is an independent JSON object whose 'text' field holds
        # the document body.
        kwargs['text_key'] = 'text'
        kwargs['loose_json'] = True
        super(webtext, self).__init__(webtext.PATH, **kwargs)
class KeyDataset(data.Dataset):
    """Pairs a text loader with a run-length loss-mask loader.

    Each mask entry is a list of run lengths: even-indexed runs expand to
    0s (loss ignored) and odd-indexed runs expand to 1s (loss computed),
    so the expanded mask lines up token-for-token with the text.
    """
    def __init__(self, text_loader, mask_loader, **kwargs):
        self.texts = text_loader
        self.masks = mask_loader
        self.is_lazy = False
        both_lazy = isinstance(self.texts, LazyLoader) and isinstance(
            self.masks, LazyLoader)
        if both_lazy:
            self.text_lens = self.texts.lens
            self.is_lazy = True
    def get_text_len(self, idx):
        # Only meaningful when both loaders are lazy (is_lazy is True).
        return self.text_lens[idx]
    def __getitem__(self, index):
        tokens = self.texts[index]
        run_lengths = self.masks[index]
        loss_mask = []
        for run_idx, run_len in enumerate(run_lengths):
            bit = run_idx % 2
            loss_mask.extend([bit] * run_len)
        assert len(tokens) == len(loss_mask)
        return {'tokens': tokens, 'loss_masks': loss_mask}
    def __len__(self):
        return len(self.texts)
class PromptDataset(data.Dataset):
    """Dataset joining a prompt loader with a text loader.

    Each item concatenates prompt + text tokens; the loss mask is 0 over
    the prompt portion and 1 over the text portion. When `to_tokenize`
    is True both fields are raw strings and are tokenized on access.
    """
    def __init__(self,
                 prompt_loader,
                 text_loader,
                 tokenizer=None,
                 to_tokenize=False,
                 **kwargs):
        self.prompts = prompt_loader
        self.texts = text_loader
        self.tokenizer = tokenizer
        self.to_tokenize = to_tokenize
        # Fix: default to False so consumers probing `.is_lazy` don't hit
        # AttributeError for non-lazy loaders (previously only assigned in
        # the lazy branch; sibling KeyDataset sets it unconditionally).
        self.is_lazy = False
        if isinstance(self.prompts, LazyLoader) and isinstance(
                self.texts, LazyLoader):
            self.prompt_lens = self.prompts.lens
            self.text_lens = self.texts.lens
            self.is_lazy = True
    def get_text_len(self, idx):
        # Combined prompt+text length; only valid when is_lazy is True.
        return self.prompt_lens[idx] + self.text_lens[idx]
    def __getitem__(self, index):
        prompt = self.prompts[index]
        text = self.texts[index]
        if self.to_tokenize:
            prompt = self.tokenizer.EncodeAsIds(prompt).tokenization
            text = self.tokenizer.EncodeAsIds(text).tokenization
        return {
            'tokens': prompt + text,
            'loss_masks': [0] * len(prompt) + [1] * len(text)
        }
    def __len__(self):
        return len(self.prompts)
class DataReader:
    """Streams a corpus file (or directory of files) through a pool of
    tokenizer worker processes, writing results via the supplied writers.

    Subclasses set PATH / assert_str and implement `tokenize_worker`,
    `write_result` and `process_line`. Queue protocol: the main process
    enqueues raw lines then one 'STOP' sentinel per worker; each worker
    replies with results and a final 'COMPLETE' marker.
    """
    # Subclasses must override with the corpus location and an error hint.
    PATH = None
    assert_str = None
    @staticmethod
    def tokenize_worker(input, output, reader, tokenizer, tokenize):
        raise NotImplementedError
    def __init__(self, writers, tokenizer=None, tokenize=False, **kwargs):
        assert os.path.exists(self.PATH), self.assert_str
        self.tokenizer = tokenizer
        self.tokenize = tokenize
        self.writers = writers
        if os.path.isdir(self.PATH):
            # Use every regular file in the directory except bz2 archives.
            paths = [
                entry.path for entry in os.scandir(self.PATH)
                if not entry.is_dir() and not entry.name.endswith('bz2')
            ]
        else:
            paths = [self.PATH]
        task_queue, done_queue = Queue(), Queue()
        processes = []
        for i in range(NUM_PROCESSES):
            process = Process(
                target=self.tokenize_worker,
                args=(task_queue, done_queue, type(self), tokenizer, tokenize))
            process.start()
            processes.append(process)
        for path in paths:
            with open(path) as file:
                for row in tqdm.tqdm(file):
                    task_queue.put(row)
        # One 'STOP' sentinel per worker terminates its iter() loop.
        for i in range(len(processes)):
            task_queue.put('STOP')
        count = len(processes)
        progress_bar = tqdm.tqdm()
        # Drain results until every worker has reported 'COMPLETE'.
        while True:
            data = done_queue.get()
            if data == 'COMPLETE':
                count -= 1
                if count == 0:
                    break
            else:
                self.write_result(data, self.writers)
                progress_bar.update()
        progress_bar.close()
    @staticmethod
    def write_result(data, writers):
        raise NotImplementedError
    @staticmethod
    def get_token_count(contents):
        # Total number of tokens across all samples.
        return sum(map(len, contents))
    @staticmethod
    def process_sample(text, tokenizer, tokenize):
        # Tokenize a string when requested; empty strings become [].
        if isinstance(text, str) and tokenize:
            text = tokenizer.EncodeAsIds(text).tokenization if text else []
        return text
    @staticmethod
    def trim_field(content, max_length):
        # Truncate long fields, marking the cut with an ellipsis.
        if len(content) > max_length:
            content = content[:max_length]
            content += '......'
        return content
    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        raise NotImplementedError
class PromptReader(DataReader):
    """DataReader whose JSON lines each yield (prompt, text) training pairs."""
    @staticmethod
    def tokenize_worker(input, output, reader, tokenizer, tokenize):
        # Consume JSON lines until the 'STOP' sentinel, emitting one
        # (prompt, text) tuple per sample, then signal completion.
        for row in iter(input.get, 'STOP'):
            data = json.loads(row)
            prompts, texts = reader.process_line(data, tokenizer, tokenize)
            for prompt, text in zip(prompts, texts):
                output.put((prompt, text))
        output.put('COMPLETE')
    @staticmethod
    def write_result(data, writers):
        # Route the two halves of a pair to their respective lazy writers.
        prompt, text = data
        writers['prompt'].write(prompt)
        writers['text'].write(text)
class KeyReader(DataReader):
PATH = '/root/data/wikipedia/wiki-key.txt'
assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py'
@classmethod
def process_line(cls, data, tokenizer, tokenize):
keys, contents = data['key'], data['content']
assert len(keys) == len(contents)
for i in range(1, len(keys)):
keys[i] = ' ' + keys[i]
contents = [' ' + content for content in contents]
keys = [tokenizer.EncodeAsIds(key).tokenization for key in keys]
contents = [
tokenizer.EncodeAsIds(content).tokenization for content in contents
]
summary = sum(keys, [])
summary_prefix = cls.process_sample('Summary: ', tokenizer, tokenize)
summary_mask = [len(summary_prefix), len(summary)]
summary = summary_prefix + summary
text, text_mask = [], []
for key, content in zip(keys, contents):
text += key
text += content
text_mask.append(len(key))
text_mask.append(len(content))
return (summary, summary_mask), (text, text_mask)
@staticmethod
def tokenize_worker(input, output, reader, tokenizer, tokenize):
for row in iter(input.get, 'STOP'):
data = json.loads(row)
summary, content = reader.process_line(data, tokenizer, tokenize)
output.put((summary, content))
output.put('COMPLETE')
@staticmethod
def write_result(data, writers):
summary, content = data
writers['text'].write(summary[0])
writers['mask'].write(summary[1])
writers['text'].write(content[0])
writers['mask'].write(content[1])
class zhihu(PromptReader):
    """Zhihu Q&A corpus: builds question-style prompts and answer texts.

    Answers are kept only when the answer body exceeds 100 chars or has
    more than 1000 upvotes.
    """
    PATH = '/root/data/zhihu/zhihu'
    # PATH = "data/zhihu/data.json"
    assert_str = 'make sure to set PATH for zhihu data_utils/corpora.py'
    # Chinese prompt-section markers; kept verbatim (runtime strings).
    qtitle_prefix = '问题:'
    qcontent_prefix = '问题描述:'
    user_prefix = '回答用户:'
    answer_prefix = ' 回答:'
    # qtitle_prefix = []
    # qcontent_prefix = []
    # user_prefix = []
    # answer_prefix = []
    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        prompts, texts = [], []
        ans_length = len(data.get('ans-content', ''))
        ans_up = data.get('ans-up-num', '')
        ans_up = int(ans_up) if ans_up else 0
        if ans_length > 100 or ans_up > 1000:
            # NOTE(review): key names mix 'q_title' and 'q-content' — this
            # matches the source data schema; confirm before normalizing.
            qtitle = data['q_title']
            qcontent = data['q-content']
            if qcontent is None:
                qcontent = ''
            qcontent = cls.trim_field(qcontent, max_length=100)
            user = data.get('user-signature', '')
            prompt = cls.qtitle_prefix + qtitle + cls.qcontent_prefix + qcontent + cls.user_prefix + user + cls.answer_prefix  # noqa
            text = data['ans-content']
            prompt, text = cls.process_sample(prompt, tokenizer,
                                              tokenize), cls.process_sample(
                                                  text, tokenizer, tokenize)
            prompts.append(prompt)
            texts.append(text)
        # prompt = data["q_title"] + data["q-content"] + data["user-signature"]
        # text = data["ans-content"]
        # prompts.append(prompt)
        # texts.append(text)
        return prompts, texts
class zhidao(PromptReader):
    """Baidu Zhidao Q&A corpus: one prompt per question, paired with the
    best answer (>10 chars) and any other answers (>100 chars)."""
    PATH = '/root/data/zhidao/zhidao'
    assert_str = 'make sure to set PATH for zhidao data_utils/corpora.py'
    # Chinese prompt-section markers; kept verbatim (runtime strings).
    qtitle_prefix = '问题:'
    qcontent_prefix = '问题描述:'
    answer_prefix = '回答:'
    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        if 'title' not in data:
            return [], []
        prompts, texts = [], []
        qtitle = data['title']
        qcontent = data.get('content', '')
        qcontent = cls.trim_field(qcontent, max_length=100)
        prompt = cls.qtitle_prefix + qtitle + cls.qcontent_prefix + qcontent + cls.answer_prefix
        prompt = cls.process_sample(prompt, tokenizer, tokenize)
        if 'best_answer' in data:
            text = data['best_answer']['content']
            if len(text) > 10:
                text = cls.process_sample(text, tokenizer, tokenize)
                prompts.append(prompt)
                texts.append(text)
        # Secondary answers need a stricter length filter than the best one.
        for answer in data.get('other_answers', []):
            text = answer['content']
            if len(text) > 100:
                text = cls.process_sample(text, tokenizer, tokenize)
                prompts.append(prompt)
                texts.append(text)
        return prompts, texts
class baike(PromptReader):
    """Baidu Baike corpus: empty prompt, concatenated title+abstract+content
    as the text."""
    PATH = '/root/data/baike/baike'
    assert_str = 'make sure to set PATH for baike data_utils/corpora.py'
    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        prompts, texts = [], []
        text = data.get('title', '') + data.get('abstract', '') + data.get(
            'content', '')
        if text:
            p, t = cls.process_sample('', tokenizer,
                                      tokenize), cls.process_sample(
                                          text, tokenizer, tokenize)
            prompts.append(p)
            texts.append(t)
        return prompts, texts
class wikipedia(PromptReader):
    """
    dataset for wikipedia with arguments configured for convenience
    command line usage: `--train-data wikipedia`
    """
    # PATH = '/dataset/data/wiki.txt'
    PATH = '/root/data/wikipedia/wiki.txt'
    assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py'
    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        # Empty prompt; the whole article body is the training text.
        text = data['text']
        prompt, text = cls.process_sample('', tokenizer,
                                          tokenize), cls.process_sample(
                                              text, tokenizer, tokenize)
        return [prompt], [text]
# Registry mapping corpus names (as passed on the command line) to their
# reader/dataset classes; consumed by data_utils.get_dataset.
NAMED_CORPORA = {
    'wikipedia': wikipedia,
    'wikipedia-key': KeyReader,
    'webtext': webtext,
    'zhihu': zhihu,
    'zhidao': zhidao,
    'baike': baike
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,70 @@
# Copyright (c) 2022 Zhipu.AI
import glob
import os
import json
import nltk
nltk.download('punkt')
class NLTKSegmenter:
    """Thin wrapper around NLTK's sentence tokenizer."""
    def __init__(self):
        # Fix: the constructor was misspelled `__init` (one trailing
        # underscore short), so it was never invoked as a constructor.
        pass
    @staticmethod
    def segment_string(article):
        """Split *article* into a list of sentences."""
        return nltk.tokenize.sent_tokenize(article)
# Convert WikiExtractor output into one JSON object per article with
# 'key' (section lead sentences) and 'content' (remaining sentences) lists.
wiki_path = 'data/extracted'
output_path = 'formatted/wiki-key.txt'
segmenter = NLTKSegmenter()
with open(output_path, 'w') as output:
    for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False):
        for filename in glob.glob(
                os.path.join(dirname, 'wiki_*'), recursive=True):
            print(filename)
            article_lines = []
            article_open = False
            with open(filename, mode='r', newline='\n') as file:
                for line in file:
                    line = line.rstrip()
                    if '<doc id=' in line:
                        # Start of a new article.
                        article_open = True
                    elif '</doc>' in line:
                        # End of article: pair single lead sentences ("keys")
                        # with the multi-sentence paragraphs that follow them.
                        key_sentences, contents = [], []
                        key, content = None, []
                        # article_lines[0] is the title; skip it.
                        for sentences in article_lines[1:]:
                            if len(sentences) > 1:
                                if key:
                                    if len(content) > 0 or len(contents) == 0:
                                        key_sentences.append(key)
                                        contents.append(content)
                                    else:
                                        # Dangling key with no body: fold it
                                        # into the previous content block.
                                        contents[-1].append(key)
                                    key, content = None, []
                                key_sentences.append(sentences[0])
                                contents.append(sentences[1:])
                            elif len(sentences) > 0:
                                if key:
                                    content.append(sentences[0])
                                else:
                                    key = sentences[0]
                        # Flush any trailing key/content pair.
                        if key:
                            if len(content) > 0 or len(contents) == 0:
                                key_sentences.append(key)
                                contents.append(content)
                            else:
                                contents[-1].append(key)
                        contents = [' '.join(content) for content in contents]
                        article = {'key': key_sentences, 'content': contents}
                        output.write(json.dumps(article))
                        output.write('\n')
                        article_open = False
                        article_lines = []
                    else:
                        # Body line inside an article: split into sentences.
                        if article_open and line:
                            sentences = segmenter.segment_string(line)
                            article_lines.append(sentences)

View File

@@ -0,0 +1,259 @@
# This file is provided as is from:
# https://github.com/huggingface/pytorch-pretrained-BERT
# Please refer to their repository for copyright.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import os
import shutil
import sys
import tempfile
from functools import wraps
from hashlib import sha256
from io import open
import boto3
import json
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(
os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except (AttributeError, ImportError):
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
'PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser('~'), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    """
    name = sha256(url.encode('utf-8')).hexdigest()
    if etag:
        name = name + '.' + sha256(etag.encode('utf-8')).hexdigest()
    return name
def filename_to_url(filename, cache_dir=None):
    """
    Return the url and etag (which may be ``None``) stored for `filename`.
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    cache_path = os.path.join(cache_dir, filename)
    meta_path = cache_path + '.json'
    # Both the cached file and its sidecar metadata must be present.
    for required in (cache_path, meta_path):
        if not os.path.exists(required):
            raise EnvironmentError('file {} not found'.format(required))
    with open(meta_path, encoding='utf-8') as meta_file:
        metadata = json.load(meta_file)
    return metadata['url'], metadata['etag']
def cached_path(url_or_filename, cache_dir=None):
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.

    Raises EnvironmentError for a missing local file and ValueError for
    anything that is neither a known URL scheme nor an existing path.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    parsed = urlparse(url_or_filename)
    if parsed.scheme in ('http', 'https', 's3'):
        # URL, so get it from the cache (downloading if necessary)
        return get_from_cache(url_or_filename, cache_dir)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
    elif parsed.scheme == '':
        # File, but it doesn't exist.
        raise EnvironmentError('file {} not found'.format(url_or_filename))
    else:
        # Something unknown
        raise ValueError(
            'unable to parse {} as a URL or as a local path'.format(
                url_or_filename))
def split_s3_path(url):
    """Split a full s3 path into the bucket name and path."""
    parsed = urlparse(url)
    if not parsed.netloc or not parsed.path:
        raise ValueError('bad s3 path {}'.format(url))
    bucket_name, s3_path = parsed.netloc, parsed.path
    # Drop a single leading '/' from the key, if present.
    if s3_path.startswith('/'):
        s3_path = s3_path[1:]
    return bucket_name, s3_path
def s3_request(func):
    """
    Wrapper function for s3 requests in order to create more helpful error
    messages.
    """
    @wraps(func)
    def wrapper(url, *args, **kwargs):
        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            status = int(exc.response['Error']['Code'])
            if status == 404:
                # Translate the opaque boto error into a clear message.
                raise EnvironmentError('file {} not found'.format(url))
            raise
    return wrapper
@s3_request
def s3_etag(url):
    """Check ETag on S3 object."""
    bucket_name, s3_path = split_s3_path(url)
    s3_object = boto3.resource('s3').Object(bucket_name, s3_path)
    return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
    """Pull a file directly from S3."""
    bucket_name, s3_path = split_s3_path(url)
    bucket = boto3.resource('s3').Bucket(bucket_name)
    bucket.download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
    """Stream `url` into `temp_file`, showing a byte-level progress bar."""
    response = requests.get(url, stream=True)
    length_header = response.headers.get('Content-Length')
    total_bytes = int(length_header) if length_header is not None else None
    progress = tqdm(unit='B', total=total_bytes)
    for chunk in response.iter_content(chunk_size=1024):
        if not chunk:  # skip keep-alive chunks
            continue
        progress.update(len(chunk))
        temp_file.write(chunk)
    progress.close()
def get_from_cache(url, cache_dir=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.

    The cache key is a hash of the URL plus its ETag (when the server
    provides one), so a changed remote file gets a fresh cache entry.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    # Get eTag to add to filename, if it exists.
    if url.startswith('s3://'):
        etag = s3_etag(url)
    else:
        response = requests.head(url, allow_redirects=True)
        if response.status_code != 200:
            raise IOError(
                'HEAD request failed for url {} with status code {}'.format(
                    url, response.status_code))
        etag = response.headers.get('ETag')
    filename = url_to_filename(url, etag)
    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)
    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info('%s not found in cache, downloading to %s', url,
                        temp_file.name)
            # GET file object
            if url.startswith('s3://'):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)
            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)
            logger.info('copying %s to cache at %s', temp_file.name,
                        cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)
            logger.info('creating metadata file for %s', cache_path)
            # Sidecar .json records provenance for filename_to_url().
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w', encoding='utf-8') as meta_file:
                json.dump(meta, meta_file)
            logger.info('removing temp file %s', temp_file.name)
    return cache_path
def read_set_from_file(filename):
    '''
    Extract a de-duped collection (set) of text from a file.
    Expected file format is one item per line.
    '''
    with open(filename, 'r', encoding='utf-8') as file_:
        return {line.rstrip() for line in file_}
def get_file_extension(path, dot=True, lower=True):
    """Return *path*'s extension, optionally without the dot / lowercased."""
    ext = os.path.splitext(path)[1]
    if not dot:
        ext = ext[1:]
    return ext.lower() if lower else ext

View File

@@ -0,0 +1,253 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for loading text from disk"""
import mmap
import os
import pickle as pkl
import time
from itertools import accumulate
import numpy as np
import torch
from torch.multiprocessing import Lock
def get_lazy_path(path):
    """
    Gets directory path where lazy files are stored.
    """
    root, _ = os.path.splitext(path)
    return root + '.lazy'
def exists_lazy(path, data_type='data'):
    """
    Check if we've already made a lazy version of this file for the `data_type` field.
    """
    lazypath = get_lazy_path(path)
    if not os.path.exists(lazypath):
        return False
    contents = os.listdir(lazypath)
    # Both the flat data file and its length index must exist.
    return data_type in contents and data_type + '.len.pkl' in contents
class LazyWriter:
    """Serially appends samples for one data_type into a `.lazy` directory.

    Array samples are appended as raw C-order bytes of `array_data_type`;
    string samples as utf-8. Per-sample lengths are pickled to
    `<data_type>.len.pkl` on close so LazyLoader can slice the flat file
    back into individual samples.
    """
    def __init__(self,
                 path,
                 data_type,
                 is_array=False,
                 array_data_type=np.int32):
        lazypath = get_lazy_path(path)
        if not os.path.exists(lazypath):
            os.makedirs(lazypath)
        self.datapath = os.path.join(lazypath, data_type)
        self.lenpath = os.path.join(lazypath, data_type + '.len.pkl')
        self.array_data_type = array_data_type
        self.output = open(self.datapath, 'wb')
        self.lengths = []
        self.is_array = is_array
    @staticmethod
    def get_len_path(path, data_type):
        """Path of the pickled per-sample length index for `data_type`."""
        return os.path.join(get_lazy_path(path), data_type + '.len.pkl')
    def write(self, s):
        """Append one sample; dicts contribute their 'text' field."""
        if isinstance(s, dict):
            s = s['text']
        if self.is_array:
            payload = np.array(
                s, dtype=self.array_data_type).tobytes(order='C')
            self.output.write(payload)
            # Element count, not byte count, for array samples.
            self.lengths.append(len(s))
        else:
            payload = s.encode('utf-8')
            self.output.write(payload)
            # Byte count (utf-8 length) for string samples.
            self.lengths.append(len(payload))
    def close(self):
        """Flush the data file and dump the length index."""
        self.output.close()
        with open(self.lenpath, 'wb') as f:
            pkl.dump(self.lengths, f)
def split_strings(strings, start, chr_lens):
    """
    Split strings based on string lengths and given start.
    """
    # Pair consecutive boundaries; offsets are relative to `start`.
    bounds = [start] + chr_lens
    return [
        strings[lo - start:hi - start]
        for lo, hi in zip(bounds[:-1], bounds[1:])
    ]
class ProcessorTokenizer:
    """
    callable class that runs a preprocessing, as well as tokenization step,
    on input text.
    """
    def __init__(self, tokenizer, process_fn=None):
        self.tokenizer = tokenizer
        self.process_fn = process_fn
    def __call__(self, string):
        # Tokenizer (when present) is responsible for applying process_fn;
        # otherwise apply process_fn directly, or pass through untouched.
        if self.tokenizer is not None:
            return self.tokenizer(string, process_fn=self.process_fn)
        if self.process_fn is not None:
            return self.process_fn(string)
        return string
class LazyLoader(object):
    """
    Arguments:
        path: path to directory where array entries are concatenated into one big string file
            and the .len file are located
        data_type (str): Some datsets have multiple fields that are stored in different paths.
            `data_type` specifies which of these fields to load in this class
        mem_map (boolean): Specifies whether to memory map file `path`
        map_fn (callable): Fetched strings are passed through map_fn before being returned.
    Example of lazy loader directory structure:
    file.json
    file.lazy/
        data_type1
        data_type1.len.pkl
        data_type2
        data_type2.len.pkl
    """
    def __init__(self,
                 path,
                 data_type='data',
                 mem_map=False,
                 map_fn=None,
                 is_array=False,
                 array_data_type=np.int32):
        lazypath = get_lazy_path(path)
        datapath = os.path.join(lazypath, data_type)
        # get file where array entries are concatenated into one big string
        self._file = open(datapath, 'rb')
        self.file = self._file
        self.is_array = is_array
        self.array_data_type = array_data_type
        # Per-sample lengths and their cumulative end offsets.
        lenpath = os.path.join(lazypath, data_type + '.len.pkl')
        self.lens = pkl.load(open(lenpath, 'rb'))
        self.ends = list(accumulate(self.lens))
        self.dumb_ends = list(self.ends)
        self.mem_map = mem_map
        if self.mem_map:
            # An empty data file cannot be mmap'd; fall back to an empty buffer.
            if is_array:
                if self.ends[-1] == 0:
                    self.file = np.array([], dtype=array_data_type)
                else:
                    self.file = np.memmap(
                        self.file, dtype=array_data_type, mode='r', order='C')
            else:
                if self.ends[-1] == 0:
                    self.file = bytearray()
                else:
                    self.file = mmap.mmap(
                        self.file.fileno(), 0, prot=mmap.PROT_READ)
        self.read_lock = Lock()
        self.process_fn = map_fn
        self.map_fn = map_fn
        self._tokenizer = None
        self.is_lazy = True
    def SetTokenizer(self, tokenizer):
        """
        logic to set and remove (set to None) tokenizer.
        combines preprocessing/tokenization into one callable.
        """
        if tokenizer is None:
            if not hasattr(self, '_tokenizer'):
                self._tokenizer = tokenizer
        else:
            self._tokenizer = tokenizer
        self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)
    def GetTokenizer(self):
        return self._tokenizer
    def __getitem__(self, index):
        """
        read file and splice strings based on string ending array `self.ends`
        """
        if not isinstance(index, slice):
            if index == 0:
                start = 0
            else:
                start = self.ends[index - 1]
            end = self.ends[index]
            rtn = self.file_read(start, end)
            if self.map_fn is not None:
                return self.map_fn(rtn)
        else:
            # if slice, fetch strings with 1 diskread and then splice in memory
            chr_lens = self.ends[index]
            if index.start == 0 or index.start is None:
                start = 0
            else:
                start = self.ends[index.start - 1]
            stop = chr_lens[-1]
            strings = self.file_read(start, stop)
            rtn = split_strings(strings, start, chr_lens)
            if self.map_fn is not None:
                return self.map_fn([s for s in rtn])
        return rtn
    def __len__(self):
        return len(self.ends)
    def file_read(self, start=0, end=None):
        """read specified portion of file"""
        data_type_size = np.dtype(self.array_data_type).itemsize
        # atomic reads to avoid race conditions with multiprocess dataloader
        self.read_lock.acquire()
        if not self.mem_map:
            # seek to start of file read
            if self.is_array:
                start = start * data_type_size
                end = end * data_type_size if end is not None else None
            self.file.seek(start)
            # read to end of file if no end point provided
            if end is None:
                rtn = self.file.read()
            # else read amount needed to reach end point
            else:
                rtn = self.file.read(end - start)
            if self.is_array:
                # Fix: shape must be an integer; `/` yields a float, which
                # np.ndarray rejects. Use floor division (lengths are exact
                # multiples of the item size by construction).
                rtn = np.ndarray(
                    shape=(len(rtn) // data_type_size, ),
                    dtype=self.array_data_type,
                    buffer=rtn,
                    order='C')
            else:
                rtn = rtn.decode('utf-8', 'ignore')
        else:
            rtn = self.file[start:end]
            if self.is_array:
                rtn = rtn.copy()
            else:
                rtn = rtn.decode('utf-8', 'strict')
        self.read_lock.release()
        # TODO: @raulp figure out mem map byte string bug
        # if mem map'd need to decode byte string to string
        return rtn

View File

@@ -0,0 +1,195 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""batch samplers that work with either random or sequential data samplers"""
import math
import os
import sys
import numpy as np
import torch
from torch.utils import data
class RandomSampler(data.sampler.Sampler):
    r"""
    Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler,
    but this class lets the user set an epoch like DistributedSampler
    Samples elements randomly. If without replacement, then sample from a shuffled dataset.
    If with replacement, then user can specify ``num_samples`` to draw.
    Arguments:
        data_source (Dataset): dataset to sample from
        num_samples (int): number of samples to draw, default=len(dataset)
        replacement (bool): samples are drawn with replacement if ``True``, default=False
    """
    def __init__(self, data_source, replacement=False, num_samples=None):
        self.data_source = data_source
        self.replacement = replacement
        self._num_samples = num_samples
        # epoch < 0 means "unseeded"; set_epoch() makes shuffling reproducible.
        self.epoch = -1
        if self._num_samples is not None and replacement is False:
            raise ValueError(
                'With replacement=False, num_samples should not be specified, '
                'since a random permute will be performed.')
        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError('num_samples should be a positive integer '
                             'value, but got num_samples={}'.format(
                                 self.num_samples))
        if not isinstance(self.replacement, bool):
            raise ValueError('replacement should be a boolean value, but got '
                             'replacement={}'.format(self.replacement))
    @property
    def num_samples(self):
        # dataset size might change at runtime
        if self._num_samples is None:
            return len(self.data_source)
        return self._num_samples
    def __iter__(self):
        n = len(self.data_source)
        g = torch.Generator()
        # Seed per epoch so every rank shuffles identically for that epoch.
        if self.epoch >= 0:
            g.manual_seed(self.epoch)
        if self.replacement:
            return iter(
                torch.randint(
                    high=n,
                    size=(self.num_samples, ),
                    dtype=torch.int64,
                    generator=g).tolist())
        return iter(torch.randperm(n, generator=g).tolist())
    def __len__(self):
        return self.num_samples
    def set_epoch(self, epoch):
        # Called by the training loop to vary (but reproduce) shuffling.
        self.epoch = epoch
class DistributedSequentialSampler(data.sampler.Sampler):
    """Deterministic strided batch sampler for multi-worker training.

    Each yielded batch spreads its indices evenly over the dataset
    (stride = num_samples // batch_size); every worker receives only its own
    contiguous slice of that batch. rank == -1 selects single-process mode.
    """

    def __init__(self,
                 num_samples,
                 train_iters,
                 batch_size,
                 rank=-1,
                 world_size=2):
        super().__init__(num_samples)
        if rank == -1:
            # Not distributed: act as the only worker.
            rank = 0
            world_size = 1
        self.num_samples = num_samples
        self.rank = rank
        self.world_size = world_size
        self.start_iter = 0
        self.train_iters = train_iters
        self.batch_size = batch_size
        # Per-slot offsets spreading a single batch across the dataset.
        self.batch_bias = [
            i * (num_samples // batch_size) for i in range(batch_size)
        ]

    def __iter__(self):
        for step in range(self.start_iter, self.train_iters * 10):
            whole_batch = [(step + offset) % self.num_samples
                           for offset in self.batch_bias]
            yield self._batch(whole_batch)

    def __len__(self):
        return self.train_iters

    def _batch(self, batch):
        """extracts samples only pertaining to this worker's batch"""
        begin = self.rank * self.batch_size // self.world_size
        finish = (self.rank + 1) * self.batch_size // self.world_size
        return batch[begin:finish]
class DistributedBatchSampler(data.sampler.BatchSampler):
    """
    similar to normal implementation of distributed sampler, except implementation is at the
    batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary
    data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
    """

    def __init__(self,
                 sampler,
                 batch_size,
                 drop_last,
                 rank=-1,
                 world_size=2,
                 wrap_last=False,
                 gradient_accumulation_steps=None):
        # batch_size here is the GLOBAL batch size; each worker receives a
        # batch_size // world_size slice of every assembled batch.
        super(DistributedBatchSampler, self).__init__(sampler, batch_size,
                                                      drop_last)
        if rank == -1:
            # Unlike DistributedSequentialSampler, this sampler has no
            # single-process fallback: the caller must supply a valid rank.
            assert False, 'should not be here'
        self.rank = rank
        self.world_size = world_size
        # wrap_around counters let the final partial batch of an epoch be
        # topped up with samples from the start of the next pass.
        self.sampler.wrap_around = 0
        self.wrap_around = 0
        self.wrap_last = wrap_last
        # Number of (effective) batches to skip when resuming mid-epoch.
        self.start_iter = 0
        self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps # noqa

    def __iter__(self):
        batch = []
        i = 0
        for idx in self.data_iterator(self.sampler, wrap_around=False):
            batch.append(idx)
            if len(batch) == self.batch_size:
                tbatch = self._batch(batch)
                # Skip batches consumed before a resume point, then reset so
                # later epochs start from the beginning.
                if i >= self.start_iter * self.effective_batch_size:
                    yield tbatch
                    self.start_iter = 0
                i += len(batch)
                batch = []
        # Handle the trailing partial batch (only when not dropping it).
        batch_len = len(batch)
        if batch_len > 0 and not self.drop_last:
            if self.wrap_last:
                self.sampler.wrap_around -= (self.batch_size)
                self.wrap_around += (len(batch))
                self.wrap_around %= self.batch_size
                # NOTE(review): TransposedSampler is defined elsewhere in this
                # module (not visible here); only that sampler type gets its
                # partial batch refilled by wrapping to the epoch start.
                if isinstance(self.sampler, TransposedSampler):
                    for i, idx in enumerate(
                            self.data_iterator(self.sampler,
                                               wrap_around=True)):
                        if i == 0:
                            continue
                        batch.append(idx)
                        new_batch_len = len(batch) # noqa
                        if len(batch) == self.batch_size:
                            break
            yield self._batch(batch)
        if self.wrap_last:
            self.sampler.wrap_around += self.batch_size

    def data_iterator(self, _iter, wrap_around=False):
        """iterates through data and handles wrap around"""
        for i, idx in enumerate(_iter):
            # Skip indices already consumed by a previous wrapped epoch.
            if i < self.wrap_around % self.batch_size:
                continue
            if wrap_around:
                self.wrap_around += 1
                self.wrap_around %= self.batch_size
            yield idx

    def _batch(self, batch):
        """extracts samples only pertaining to this worker's batch"""
        start = self.rank * self.batch_size // self.world_size
        end = (self.rank + 1) * self.batch_size // self.world_size
        return batch[start:end]

View File

@@ -0,0 +1,140 @@
# modified by Zhipu.Ai
import os
import json
import sentencepiece as spm
def get_pairs(word):
    """Return the set of adjacent symbol pairs in ``word``.

    ``word`` is a sequence of symbols (e.g. a tuple of strings).
    """
    previous = word[0]
    result = set()
    for current in word[1:]:
        result.add((previous, current))
        previous = current
    return result
class Encoder:
    """Byte-pair encoder over a plain token->id vocabulary.

    Note: unlike the GPT-2 tokenizer below, ``tokenize`` applies BPE to the
    raw input string as a whole (no regex pre-splitting), and unknown tokens
    fall back to id 1.
    """

    def __init__(self, encoder, bpe_merges):
        """
        Args:
            encoder: dict mapping token string -> integer id.
            bpe_merges: ordered list of (first, second) merge pairs,
                highest priority first.
        """
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        # Lower rank == higher merge priority.
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.max_len = 0  # kept for interface compatibility; unused here

    def bpe(self, token):
        """Apply BPE merges to ``token``; return space-joined sub-tokens."""
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)
        if not pairs:
            return token
        while True:
            # Merge the highest-priority (lowest-rank) adjacent pair first.
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # Fix: catch only ValueError from index(); the original
                    # bare `except:` also swallowed KeyboardInterrupt etc.
                    new_word.extend(word[i:])
                    break
                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Tokenize ``text`` and map to ids (unknown tokens -> 1)."""
        return [self.encoder.get(token, 1) for token in self.tokenize(text)]

    def decode(self, tokens):
        """Map ids back to their token strings and concatenate."""
        return ''.join([self.decoder[token] for token in tokens])

    def tokenize(self, text):
        """Apply BPE to the whole input string and split on spaces."""
        return [bpe_token for bpe_token in self.bpe(text).split(' ')]

    def convert_tokens_to_ids(self, tokens):
        """Look up ids for ``tokens`` (unknown tokens -> 1)."""
        return [self.encoder.get(token, 1) for token in tokens]
class Encoder_SP:
    """Thin wrapper around a serialized SentencePiece model."""

    def __init__(self, model_path):
        # Load the SentencePiece model file from disk.
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

    def encode(self, text):
        """Encode a raw string into a list of piece ids."""
        return self.sp.EncodeAsIds(text)

    def decode(self, tokens):
        """Decode a sequence of ids (coerced to int) back into text."""
        return self.sp.DecodeIds([int(token) for token in tokens])

    def tokenize(self, text):
        """Split a raw string into SentencePiece surface pieces."""
        return self.sp.EncodeAsPieces(text)

    def convert_tokens_to_ids(self, tokens):
        """Map each piece string to its id."""
        return [self.sp.PieceToId(piece) for piece in tokens]

    def convert_token_to_id(self, token):
        """Map a single piece string to its id."""
        return self.sp.PieceToId(token)

    def convert_id_to_token(self, idx):
        """Map a single id back to its piece string."""
        return self.sp.IdToPiece(idx)
def get_encoder(encoder_file, bpe_file):
    """Build a tokenizer from model files on disk.

    Args:
        encoder_file: either a SentencePiece ``.model`` file or a JSON
            vocabulary file mapping token -> id.
        bpe_file: path of the BPE merges file; must be '' when
            ``encoder_file`` is a SentencePiece model.

    Returns:
        ``Encoder_SP`` for SentencePiece models, otherwise ``Encoder``.
    """
    # Fix: drop unused locals (`filepath`, misspelled `shotname`); only the
    # extension is actually needed.
    extension = os.path.splitext(encoder_file)[1]
    if extension == '.model' and bpe_file == '':
        return Encoder_SP(encoder_file)
    with open(encoder_file, 'r', encoding='utf-8') as f:
        encoder = json.load(f)
    with open(bpe_file, 'r', encoding='utf-8') as f:
        bpe_data = f.read()
    # First line of the merges file is a version header and the final split
    # entry (after the trailing newline) is empty, so both are dropped.
    bpe_merges = [
        tuple(merge_str.split())
        for merge_str in bpe_data.split('\n')[1:-1]
    ]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )
def from_pretrained(model_path):
    """Load the Chinese SentencePiece tokenizer shipped under ``model_path``.

    Fix: build the path with os.path.join instead of '/'-concatenation so it
    works on all platforms; resolves to
    ``<model_path>/chinese_sentencepiece/cog-pretrain.model``.
    """
    return get_encoder(
        os.path.join(model_path, 'chinese_sentencepiece',
                     'cog-pretrain.model'), '')

View File

@@ -0,0 +1,160 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DataLoader for TFRecords"""
import queue
import threading
import numpy as np
import tensorflow as tf
import torch
# TF 1.x: enable eager execution at import time so dataset elements expose
# `.numpy()` (required by convert_tf_example_to_torch_tensors below).
tf.enable_eager_execution()
class TFRecordDataLoader(object):
    """Iterates BERT pretraining TFRecords, yielding dicts of torch tensors.

    Builds a TF 1.x (contrib-era) input pipeline; eager execution must be
    enabled at import time for the tensor -> numpy conversion to work.
    """

    def __init__(self,
                 records,
                 batch_size,
                 max_seq_len,
                 max_preds_per_seq,
                 train,
                 num_workers=2,
                 seed=1,
                 threaded_dl=False):
        # records: path or list of paths to TFRecord files.
        # threaded_dl: if True, convert examples in a background thread.
        assert max_preds_per_seq is not None, '--max-preds-per-seq MUST BE SPECIFIED when using tfrecords'
        tf.set_random_seed(seed)
        if isinstance(records, str):
            records = [records]
        # Fixed-length feature schema of standard BERT pretraining records.
        self.record_converter = Record2Example({
            'input_ids':
            tf.FixedLenFeature([max_seq_len], tf.int64),
            'input_mask':
            tf.FixedLenFeature([max_seq_len], tf.int64),
            'segment_ids':
            tf.FixedLenFeature([max_seq_len], tf.int64),
            'masked_lm_positions':
            tf.FixedLenFeature([max_preds_per_seq], tf.int64),
            'masked_lm_ids':
            tf.FixedLenFeature([max_preds_per_seq], tf.int64),
            'masked_lm_weights':
            tf.FixedLenFeature([max_preds_per_seq], tf.float32),
            'next_sentence_labels':
            tf.FixedLenFeature([1], tf.int64)
        })
        # Instantiate dataset according to original BERT implementation
        if train:
            self.dataset = tf.data.Dataset.from_tensor_slices(
                tf.constant(records))
            self.dataset = self.dataset.repeat()
            self.dataset = self.dataset.shuffle(buffer_size=len(records))
            # use sloppy tfrecord dataset
            self.dataset = self.dataset.apply(
                tf.contrib.data.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy=train,
                    cycle_length=min(num_workers, len(records))))
            self.dataset = self.dataset.shuffle(buffer_size=100)
        else:
            self.dataset = tf.data.TFRecordDataset(records)
            self.dataset = self.dataset.repeat()
        # Instantiate dataloader (do not drop remainder for eval)
        loader_args = {
            'batch_size': batch_size,
            'num_parallel_batches': num_workers,
            'drop_remainder': train
        }
        self.dataloader = self.dataset.apply(
            tf.contrib.data.map_and_batch(self.record_converter,
                                          **loader_args))
        self.threaded_dl = threaded_dl
        self.num_workers = num_workers

    def __iter__(self):
        # Both branches yield torch tensor dicts; the threaded path performs
        # the TF->torch conversion in the MultiprocessLoader worker thread.
        if self.threaded_dl:
            data_iter = iter(
                MultiprocessLoader(self.dataloader, self.num_workers))
            for item in data_iter:
                yield item
        else:
            data_iter = iter(self.dataloader)
            for item in data_iter:
                yield convert_tf_example_to_torch_tensors(item)
class Record2Example(object):
    """Callable that decodes a serialized BERT TF record into a TF example."""

    def __init__(self, feature_map):
        # Mapping of feature name -> tf.FixedLenFeature spec.
        self.feature_map = feature_map

    def __call__(self, record):
        """Parse ``record`` and downcast every int64 feature to int32."""
        example = tf.parse_single_example(record, self.feature_map)
        for name, value in list(example.items()):
            if value.dtype == tf.int64:
                example[name] = tf.to_int32(value)
        return example
def convert_tf_example_to_torch_tensors(example):
    """Convert an eager TF BERT example into the torch tensor dict used here.

    Densifies the sparse masked-LM triplet (positions/ids/weights) into a
    0/1 ``mask`` and a ``mask_labels`` array holding -1 at unmasked slots.
    """
    arrays = {name: value.numpy() for name, value in example.items()}
    mask = np.zeros_like(arrays['input_ids'])
    mask_labels = np.ones_like(arrays['input_ids']) * -1
    positions = arrays['masked_lm_positions'].astype(int)
    for row, row_positions in enumerate(positions):
        for col, position in enumerate(row_positions):
            # A zero weight marks padding in the masked-lm arrays.
            if arrays['masked_lm_weights'][row, col] != 0:
                mask[row, position] = 1
                mask_labels[row, position] = arrays['masked_lm_ids'][row, col]
    output = {
        'text': arrays['input_ids'],
        'types': arrays['segment_ids'],
        'is_random': arrays['next_sentence_labels'],
        'pad_mask': 1 - arrays['input_mask'],
        'mask': mask,
        'mask_labels': mask_labels,
    }
    return {name: torch.from_numpy(value) for name, value in output.items()}
class MultiprocessLoader(object):
    """Wraps a TF dataloader with a daemon thread that pre-converts examples
    to torch tensors through a bounded queue."""

    def __init__(self, dataloader, num_workers=2):
        self.dl = dataloader
        # Bounded queue: the producer blocks once it is this far ahead.
        self.queue_size = 2 * num_workers

    def __iter__(self):
        output_queue = queue.Queue(self.queue_size)
        output_thread = threading.Thread(
            target=_multiproc_iter, args=(self.dl, output_queue))
        output_thread.daemon = True
        output_thread.start()
        # NOTE(review): the while/else also runs when the producer thread
        # finishes normally (e.g. a finite dataloader is exhausted), and the
        # RuntimeError is only printed, never raised — presumably deliberate
        # best-effort reporting; confirm before changing. If the thread dies
        # while the queue is empty, the final get() can block forever.
        while output_thread.is_alive():
            yield output_queue.get(block=True)
        else:
            print(
                RuntimeError(
                    'TF record data loader thread exited unexpectedly'))
def _multiproc_iter(dl, output_queue):
    """Producer loop: convert each item of ``dl`` and push it onto
    ``output_queue``, blocking when the queue is full."""
    for item in dl:
        tensors = convert_tf_example_to_torch_tensors(item)
        output_queue.put(tensors, block=True)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,359 @@
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import os
import sys
from io import open
import json
import regex as re
from .file_utils import cached_path
try:
from functools import lru_cache
except ImportError:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
def lru_cache():
return lambda func: func
logger = logging.getLogger(__name__)
# Local file paths (originally container-absolute paths, see the commented
# entries) for the pretrained GPT-2 vocabulary and merges files.
PRETRAINED_VOCAB_ARCHIVE_MAP = {
    # 'gpt2': "/workspace/.pytorch_pretrained_bert/gpt2-vocab.json",
    'gpt2': '.pytorch_pretrained_bert/gpt2-vocab.json',
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
    # 'gpt2': "/workspace/.pytorch_pretrained_bert/gpt2-merges.txt",
    'gpt2': '.pytorch_pretrained_bert/gpt2-merges.txt',
}
# Positional-embedding table size per model: tokenized sequences longer than
# this cannot be represented, so from_pretrained clamps max_len to it.
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'gpt2': 1024,
}
# Canonical filenames used when loading from / saving to a directory.
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
@lru_cache()
def bytes_to_unicode():
    """Return a dict mapping every byte value (0-255) to a unicode character.

    Reversible byte-level BPE works on unicode strings, so each byte needs a
    printable stand-in: printable ASCII/latin-1 bytes map to themselves,
    while the remaining bytes are shifted past 255 (chr(256), chr(257), ...)
    to avoid the whitespace/control characters the BPE code barfs on.
    Without this, covering raw text would require a huge vocabulary of real
    unicode characters to avoid UNKs.
    """
    to_char = unichr if sys.version_info[0] == 2 else chr  # noqa: F821
    printable = (
        list(range(ord('!'), ord('~') + 1))
        + list(range(ord('\xa1'), ord('\xac') + 1))
        + list(range(ord('\xae'), ord('\xff') + 1)))
    byte_values = printable[:]
    char_codes = printable[:]
    shift = 0
    for byte_value in range(2**8):
        if byte_value not in byte_values:
            byte_values.append(byte_value)
            char_codes.append(2**8 + shift)
            shift += 1
    return dict(zip(byte_values, (to_char(code) for code in char_codes)))
def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as a tuple of symbols (symbols being variable-length
    strings).
    """
    previous = word[0]
    pairs_found = set()
    for symbol in word[1:]:
        pairs_found.add((previous, symbol))
        previous = symbol
    return pairs_found
class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE: input is mapped byte-by-byte to printable unicode
          stand-ins (``bytes_to_unicode``) before merging, so any string can
          be encoded without unknown tokens.
    """

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        cache_dir=None,
                        *inputs,
                        **kwargs):
        """
        Instantiate a tokenizer from a known model name or a directory.

        A known name resolves through the archive maps; a directory is
        expected to contain vocab.json / merges.txt and optionally
        special_tokens.txt. ``cache_dir`` is accepted for interface
        compatibility but unused: the cached_path download/redirect logic
        was removed upstream and files are read straight from disk.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
                pretrained_model_name_or_path]
            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[
                pretrained_model_name_or_path]
            special_tokens_file = None
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path,
                                      VOCAB_NAME)
            merges_file = os.path.join(pretrained_model_name_or_path,
                                       MERGES_NAME)
            special_tokens_file = os.path.join(pretrained_model_name_or_path,
                                               SPECIAL_TOKENS_NAME)
            if not os.path.exists(special_tokens_file):
                special_tokens_file = None
            else:
                logger.info('loading special tokens file {}'.format(
                    special_tokens_file))
        resolved_vocab_file = vocab_file
        resolved_merges_file = merges_file
        logger.info('loading vocabulary file {}'.format(vocab_file))
        logger.info('loading merges file {}'.format(merges_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        if special_tokens_file and 'special_tokens' not in kwargs:
            # Fix: read via a context manager instead of leaking the handle.
            with open(special_tokens_file, encoding='utf-8') as f:
                special_tokens = f.read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
        tokenizer = cls(
            resolved_vocab_file,
            resolved_merges_file,
            special_tokens=special_tokens,
            *inputs,
            **kwargs)
        return tokenizer

    def __init__(self,
                 vocab_file,
                 merges_file,
                 errors='replace',
                 special_tokens=None,
                 max_len=None):
        """Build the tokenizer from a JSON vocab file and a BPE merges file.

        Args:
            vocab_file: JSON mapping of token -> id.
            merges_file: text file whose lines (after a version header) are
                space-separated merge pairs, highest priority first.
            errors: error policy passed to bytes.decode() in ``decode``.
            special_tokens: extra tokens appended after the base vocabulary.
            max_len: soft cap on encoded sequence length (warn-only).
        """
        self.max_len = max_len if max_len is not None else int(1e12)
        # Fix: open both files via context managers with explicit utf-8
        # encoding instead of leaking the handles.
        with open(vocab_file, encoding='utf-8') as f:
            self.encoder = json.load(f)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding='utf-8') as f:
            bpe_data = f.read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        # Total vocabulary size including dynamically added special tokens.
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """ Add a list of additional tokens to the encoder.
            The additional tokens are indexed starting from the last index of the
            current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {
            v: k
            for k, v in self.special_tokens.items()
        }
        logger.info('Special tokens {}'.format(self.special_tokens))

    def bpe(self, token):
        """Apply (cached) BPE merges to one regex-split token; returns the
        resulting sub-tokens joined by spaces."""
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)
        if not pairs:
            return token
        while True:
            # Merge the lowest-ranked (highest-priority) adjacent pair first.
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # Fix: catch only ValueError from index(); the original
                    # bare `except:` also swallowed KeyboardInterrupt etc.
                    new_word.extend(word[i:])
                    break
                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2:
                token = ''.join(self.byte_encoder[ord(b)] for b in token)
            else:
                # Map raw utf-8 bytes to their printable unicode stand-ins.
                token = ''.join(self.byte_encoder[b]
                                for b in token.encode('utf-8'))
            bpe_tokens.extend(
                bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def convert_tokens_to_ids(self, tokens):
        """ Converts a sequence of tokens into ids using the vocab.
            A single token string returns a single id; unknown tokens map to
            id 0. Sequences longer than max_len only trigger a warning.
        """
        ids = []
        if isinstance(tokens, str) or (sys.version_info[0] == 2
                                       and isinstance(tokens, unicode)):  # noqa
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning(
                'Token indices sequence length is longer than the specified maximum '
                ' sequence length for this OpenAI GPT model ({} > {}). Running this'
                ' sequence through the model will result in indexing errors'.
                format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids in BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def encode(self, text):
        """Tokenize ``text`` and convert the pieces to ids in one call."""
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, tokens):
        """Invert ``encode``: ids -> stand-in chars -> original utf-8 text."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors=self.errors)
        return text

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary and merge files to a directory.

        Returns the paths of the written vocab, merges and special-tokens
        files, or None if ``vocab_path`` is not a directory.
        """
        if not os.path.isdir(vocab_path):
            logger.error('Vocabulary path ({}) should be a directory'.format(
                vocab_path))
            return
        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
        merge_file = os.path.join(vocab_path, MERGES_NAME)
        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
        with open(vocab_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))
        index = 0
        with open(merge_file, 'w', encoding='utf-8') as writer:
            writer.write(u'#version: 0.2\n')
            for bpe_tokens, token_index in sorted(
                    self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        'Saving vocabulary to {}: BPE merge indices are not consecutive.'
                        ' Please check that the tokenizer is not corrupted!'.
                        format(merge_file))
                    index = token_index
                writer.write(' '.join(bpe_tokens) + u'\n')
                index += 1
        index = len(self.encoder)
        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
            for token, token_index in sorted(
                    self.special_tokens.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        'Saving special tokens vocabulary to {}: BPE indices are not consecutive.'
                        ' Please check that the tokenizer is not corrupted!'.
                        format(special_tokens_file))
                    index = token_index
                writer.write(token + u'\n')
                index += 1
        return vocab_file, merge_file, special_tokens_file

View File

@@ -0,0 +1,408 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes. Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py""" # noqa
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import collections
import logging
import os
import unicodedata
from io import open
from .file_utils import cached_path
logger = logging.getLogger(__name__)
# Download URLs for the official BERT vocabularies, keyed by model shortcut.
PRETRAINED_VOCAB_ARCHIVE_MAP = {
    'bert-base-uncased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt',
    'bert-large-uncased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt',
    'bert-base-cased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt',
    'bert-large-cased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt',
    'bert-base-multilingual-uncased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt',
    'bert-base-multilingual-cased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt',
    'bert-base-chinese':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt',
}
# Positional-embedding table size per model; from_pretrained clamps max_len
# to this so tokenized sequences stay representable.
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'bert-base-uncased': 512,
    'bert-large-uncased': 512,
    'bert-base-cased': 512,
    'bert-large-cased': 512,
    'bert-base-multilingual-uncased': 512,
    'bert-base-multilingual-cased': 512,
    'bert-base-chinese': 512,
}
# Filename appended when a directory is passed instead of a vocab file.
VOCAB_NAME = 'vocab.txt'
def load_vocab(vocab_file):
    """Load a one-token-per-line vocabulary file into an OrderedDict
    mapping token -> line index."""
    vocab = collections.OrderedDict()
    with open(vocab_file, 'r', encoding='utf-8') as reader:
        for index, line in enumerate(reader):
            vocab[line.strip()] = index
    return vocab
def whitespace_tokenize(text):
    """Strip ``text`` and split it on runs of whitespace."""
    stripped = text.strip()
    return stripped.split() if stripped else []
class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
        """Constructs a BertTokenizer.
        Args:
            vocab_file: Path to a one-wordpiece-per-line vocabulary file
            do_lower_case: Whether to lower case the input
                Only has an effect when do_wordpiece_only=False
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            max_len: An artificial maximum length to truncate tokenized sequences to;
                Effective maximum length is always the minimum of this
                value (if specified) and the underlying BERT model's
                sequence length.
            never_split: List of tokens which will never be split during tokenization.
                Only has an effect when do_wordpiece_only=False
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                'model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping id -> token, preserving vocabulary order.
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case, never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        # int(1e12) acts as "effectively unlimited" when max_len is unset.
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        """Split `text` into wordpiece tokens, applying basic (punctuation /
        CJK / lower-case) tokenization first unless disabled at
        construction time."""
        if self.do_basic_tokenize:
            split_tokens = []
            for token in self.basic_tokenizer.tokenize(text):
                for sub_token in self.wordpiece_tokenizer.tokenize(token):
                    split_tokens.append(sub_token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        # Raises KeyError for tokens missing from the vocab; exceeding
        # max_len only triggers a warning, not truncation.
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            logger.warning(
                'Token indices sequence length is longer than the specified maximum '
                ' sequence length for this BERT model ({} > {}). Running this'
                ' sequence through BERT will result in indexing errors'.format(
                    len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        cache_dir=None,
                        *inputs,
                        **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        # Known shortcut names resolve to S3 URLs; anything else is treated
        # as a local path (a directory implies <dir>/vocab.txt).
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
                pretrained_model_name_or_path]
        else:
            vocab_file = pretrained_model_name_or_path
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except EnvironmentError:
            # NOTE(review): download/cache failures return None instead of
            # raising — callers must check for None.
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                'associated to this path or url.'.format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info('loading vocabulary file {}'.format(vocab_file))
        else:
            logger.info('loading vocabulary file {} from cache at {}'.format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self,
do_lower_case=True,
never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(' '.join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return [''.join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(' ')
output.append(char)
output.append(' ')
else:
output.append(char)
return ''.join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or # noqa
(cp >= 0x3400 and cp <= 0x4DBF) or # noqa
(cp >= 0x20000 and cp <= 0x2A6DF) or # noqa
(cp >= 0x2A700 and cp <= 0x2B73F) or # noqa
(cp >= 0x2B740 and cp <= 0x2B81F) or # noqa
(cp >= 0x2B820 and cp <= 0x2CEAF) or # noqa
(cp >= 0xF900 and cp <= 0xFAFF) or # noqa
(cp >= 0x2F800 and cp <= 0x2FA1F)): # noqa
return True
return False
def _clean_text(self, text):
    """Drop invalid and control characters; normalize whitespace to ' '."""
    cleaned = []
    for ch in text:
        code = ord(ch)
        # U+0000 and U+FFFD (the replacement character) mark invalid
        # input; skip them along with any other control character.
        if code == 0 or code == 0xfffd or _is_control(ch):
            continue
        cleaned.append(' ' if _is_whitespace(ch) else ch)
    return ''.join(cleaned)
class WordpieceTokenizer(object):
    """Greedy longest-match-first WordPiece tokenizer.

    Example:
        input = "unaffable"
        output = ["un", "##aff", "##able"]
    """

    def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100):
        # vocab: mapping/set of known wordpieces (continuations use '##').
        self.vocab = vocab
        # Token emitted for words that cannot be decomposed into pieces.
        self.unk_token = unk_token
        # Words longer than this are mapped straight to unk_token.
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenize a piece of text into its word pieces.

        Uses a greedy longest-match-first algorithm over the given
        vocabulary. `text` should already have been passed through
        `BasicTokenizer`.

        Args:
            text: A single token or whitespace separated tokens.

        Returns:
            A list of wordpiece tokens.
        """
        pieces = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            # Guard against pathological inputs.
            if len(chars) > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue
            sub_tokens = []
            start = 0
            matched = True
            while start < len(chars):
                # Find the longest vocab entry starting at `start`.
                end = len(chars)
                longest = None
                while start < end:
                    candidate = ''.join(chars[start:end])
                    if start > 0:
                        # Non-initial pieces carry the '##' continuation marker.
                        candidate = '##' + candidate
                    if candidate in self.vocab:
                        longest = candidate
                        break
                    end -= 1
                if longest is None:
                    # No vocab entry matches: the whole word is unknown.
                    matched = False
                    break
                sub_tokens.append(longest)
                start = end
            if matched:
                pieces.extend(sub_tokens)
            else:
                pieces.append(self.unk_token)
        return pieces
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat.startswith('C'):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False

View File

@@ -0,0 +1,20 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fp16 import * # noqa
from .fp16util import (BN_convert_float, FP16Model, clip_grad_norm,
convert_module, convert_network,
master_params_to_model_params,
model_grads_to_master_grads, network_to_half,
prep_param_lists, to_python_float, tofp16)
from .loss_scaler import * # noqa

View File

@@ -0,0 +1,657 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Stable version of apex FP16 Optimizer"""
import torch
from torch import nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from .fp16util import (clip_grad_norm, master_params_to_model_params,
model_grads_to_master_grads)
from .loss_scaler import DynamicLossScaler, LossScaler
FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
def conversion_helper(val, conversion):
    """Apply `conversion` to `val`, recursing through nested tuple/list
    structures. The container kind (tuple vs. list) is preserved."""
    if isinstance(val, (tuple, list)):
        converted = [conversion_helper(item, conversion) for item in val]
        return tuple(converted) if isinstance(val, tuple) else converted
    return conversion(val)
def fp32_to_fp16(val):
    """Convert any fp32 tensors in `val` (possibly nested tuples/lists)
    to fp16; everything else is passed through unchanged."""

    def half_conversion(item):
        to_check = item
        # Parameters/Variables wrap the underlying tensor in `.data`.
        if isinstance(to_check, (Parameter, Variable)):
            to_check = item.data
        if isinstance(to_check, FLOAT_TYPES):
            item = item.half()
        return item

    return conversion_helper(val, half_conversion)
def fp16_to_fp32(val):
    """Convert any fp16 tensors in `val` (possibly nested tuples/lists)
    to fp32; everything else is passed through unchanged."""

    def float_conversion(item):
        to_check = item
        # Parameters/Variables wrap the underlying tensor in `.data`.
        if isinstance(to_check, (Parameter, Variable)):
            to_check = item.data
        if isinstance(to_check, HALF_TYPES):
            item = item.float()
        return item

    return conversion_helper(val, float_conversion)
class FP16_Module(nn.Module):
    """Wrapper that runs `module` in fp16 behind an fp32 interface.

    Inputs are cast fp32 -> fp16 before the wrapped forward pass and the
    outputs cast back fp16 -> fp32; state-dict access is delegated to the
    wrapped module so checkpoints are unaffected by the wrapping.
    """

    def __init__(self, module):
        super(FP16_Module, self).__init__()
        self.add_module('module', module.half())

    def forward(self, *inputs, **kwargs):
        half_inputs = fp32_to_fp16(inputs)
        outputs = self.module(*half_inputs, **kwargs)
        return fp16_to_fp32(outputs)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        return self.module.state_dict(destination, prefix, keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)
# TODO: Update overflow check + downscale to use Carl's fused kernel.
class FP16_Optimizer(object):
    """
    :class:`FP16_Optimizer` wraps an existing PyTorch optimizer and manages
    static or dynamic loss scaling and fp32 master weights transparently.
    For standard use, only two lines must be changed: creating the
    :class:`FP16_Optimizer` instance, and changing the call to ``backward``.

    Example::

        model = torch.nn.Linear(D_in, D_out).cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
        ...
        # loss.backward() becomes:
        optimizer.backward(loss)

    Example with dynamic loss scaling::

        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        # optional arg to control dynamic loss scaling behavior
        # dynamic_loss_args={'scale_window' : 500})

    Args:
        init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. It also stores references to the original fp16 parameters, and updates them from the fp32 master copy at the end of each :attr:`step`.  # noqa
        static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. fp16 gradients are copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.  # noqa
        dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, overrides any ``static_loss_scale`` option.  # noqa
        dynamic_loss_args (dict, optional, default=None): Kwargs forwarded to the internal :class:`DynamicLossScaler` constructor. If unspecified, :class:`DynamicLossScaler`'s defaults are used.  # noqa
        verbose (bool, optional, default=False): Print ingested parameters and parameter groups as a sanity check. Does not disable printing when the loss scale is readjusted during dynamic loss scaling.  # noqa

    Calls to ``loss.backward()`` must be replaced with
    ``optimizer.backward(loss)``, because :class:`FP16_Optimizer` requires
    ownership of the backward pass to implement loss scaling and copies to
    master gradients.

    .. note::
        Loss scaling, either static or dynamic, is orthogonal to learning
        rate, because gradients are downscaled before being applied, so
        adjusting the loss scale should not require retuning the learning
        rate or any other hyperparameters.

    **Advanced options**

    **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that
    receives a closure; see docstring for :attr:`step`.

    **Gradient clipping**: Use :attr:`clip_master_grads`.

    **Multiple losses**: If your model accumulates gradients from multiple
    losses, supply ``update_master_grads=False`` to :attr:`backward`; see
    its docstring.

    **Manually adjusting loss scale**: The current loss scale can be
    retrieved or set via ``optimizer.loss_scale``. For static loss scaling
    this is reasonable; for dynamic loss scaling it is not recommended.

    **Multi_GPU training**: Wrapping an ``init_optimizer`` created from a
    model wrapped in Pytorch or Apex DistributedDataParallel should still
    work as intended.
    """  # noqa

    def __init__(self,
                 init_optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=False):
        # BUGFIX: the original tested `torch.cuda.is_available` without
        # calling it; the bound function is always truthy, so the guard
        # could never fire. It must be invoked.
        if not torch.cuda.is_available():
            raise SystemError('Cannot use fp16 without CUDA.')

        self.verbose = verbose

        self.optimizer = init_optimizer
        # init_state_dict sets up an alternative way to cast per-param state tensors.
        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
        # init_state_dict = init_optimizer.state_dict()

        # Per param-group bookkeeping:
        #   fp16_groups           -> model's original fp16 params
        #   fp32_from_fp16_groups -> fp32 master copies of those fp16 params
        #   fp32_from_fp32_groups -> params that were already fp32
        self.fp16_groups = []
        self.fp32_from_fp16_groups = []
        self.fp32_from_fp32_groups = []
        for i, param_group in enumerate(self.optimizer.param_groups):
            self.maybe_print(
                'FP16_Optimizer processing param group {}:'.format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            fp32_from_fp16_params_this_group = []
            # Use a distinct inner index name; the original shadowed the
            # outer loop variable `i`, which is error-prone.
            for j, param in enumerate(param_group['params']):
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        self.maybe_print(
                            'FP16_Optimizer received torch.cuda.HalfTensor with {}'
                            .format(param.size()))
                        fp16_params_this_group.append(param)
                        master_param = param.detach().clone().float()
                        master_param.requires_grad = True
                        # Copy the model parallel flag.
                        # NOTE(review): assumes every fp16 param carries a
                        # `model_parallel` attribute set upstream (mpu).
                        master_param.model_parallel = param.model_parallel
                        param_group['params'][j] = master_param
                        fp32_from_fp16_params_this_group.append(master_param)
                        # Reset existing state dict key to the new master param.
                        # We still need to recast per-param state tensors, if any, to FP32.
                        if param in self.optimizer.state:
                            self.optimizer.state[
                                master_param] = self.optimizer.state.pop(param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        self.maybe_print(
                            'FP16_Optimizer received torch.cuda.FloatTensor with {}'
                            .format(param.size()))
                        fp32_params_this_group.append(param)
                        param_group['params'][j] = param
                    else:
                        raise TypeError(
                            'Wrapped parameters must be either '
                            'torch.cuda.FloatTensor or torch.cuda.HalfTensor. '
                            'Received {}'.format(param.type()))

            self.fp16_groups.append(fp16_params_this_group)
            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)

        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
        self.optimizer.load_state_dict(self.optimizer.state_dict())
        # alternative way to cast per-param state tensors:
        # self.optimizer.load_state_dict(init_state_dict)

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            if dynamic_loss_args is not None:
                self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
            else:
                self.loss_scaler = DynamicLossScaler()
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True

        self.clip_grad_norm = clip_grad_norm

    def maybe_print(self, msg):
        # Verbosity-gated logging helper.
        if self.verbose:
            print(msg)

    def __getstate__(self):
        raise RuntimeError(
            'FP16_Optimizer should be serialized using state_dict().')

    def __setstate__(self, state):
        raise RuntimeError(
            'FP16_Optimizer should be deserialized using load_state_dict().')

    def zero_grad(self, set_grads_to_None=False):
        """
        Zero fp32 and fp16 parameter grads.
        """
        # In principle, only the .grad attributes of the model params need to be zeroed,
        # because gradients are copied into the FP32 master params. However, we zero
        # all gradients owned by the optimizer, just to be safe:
        for group in self.optimizer.param_groups:
            for p in group['params']:
                if set_grads_to_None:
                    p.grad = None
                else:
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()

        # Zero fp16 gradients owned by the model:
        for fp16_group in self.fp16_groups:
            for param in fp16_group:
                if set_grads_to_None:
                    param.grad = None
                else:
                    if param.grad is not None:
                        param.grad.detach_(
                        )  # as in torch.optim.optimizer.zero_grad()
                        param.grad.zero_()

    def _check_overflow(self):
        # Gather every grad-bearing model param (fp16 and native fp32) and
        # ask the loss scaler whether any gradient overflowed (inf/NaN).
        params = []
        for group in self.fp16_groups:
            for param in group:
                params.append(param)
        for group in self.fp32_from_fp32_groups:
            for param in group:
                params.append(param)
        self.overflow = self.loss_scaler.has_overflow(params)

    def _update_scale(self, has_overflow=False):
        self.loss_scaler.update_scale(has_overflow)

    def _master_params_to_model_params(self):
        # Copy updated fp32 master weights back into the model's fp16 params.
        for fp16_group, fp32_from_fp16_group in zip(
                self.fp16_groups, self.fp32_from_fp16_groups):
            master_params_to_model_params(fp16_group, fp32_from_fp16_group)

    def _model_params_to_master_params(self):
        # Copy model fp16 weights into the fp32 master copies (reverse sync).
        for fp16_group, fp32_from_fp16_group in zip(
                self.fp16_groups, self.fp32_from_fp16_groups):
            master_params_to_model_params(fp32_from_fp16_group, fp16_group)

    # To consider: Integrate distributed with this wrapper by registering a hook on each variable
    # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
    def _model_grads_to_master_grads(self):
        for fp16_group, fp32_from_fp16_group in zip(
                self.fp16_groups, self.fp32_from_fp16_groups):
            model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)

    def _downscale_master(self):
        # Undo the loss scaling on master grads before the optimizer step.
        if self.loss_scale != 1.0:
            for group in self.optimizer.param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param.grad.data.mul_(1. / self.loss_scale)

    def clip_master_grads(self, max_norm, norm_type=2):
        """
        Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``.

        Args:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
                infinity norm.

        Returns:
            Total norm of the current fp32 gradients (viewed as a single vector),
            or -1 if the most recently computed fp16 gradients overflowed
            (that is, if ``self.overflow`` is ``True``).
        """
        if not self.overflow:
            fp32_params = []
            for param_group in self.optimizer.param_groups:
                for param in param_group['params']:
                    fp32_params.append(param)
            return self.clip_grad_norm(fp32_params, max_norm, norm_type)
        else:
            return -1

    def state_dict(self):
        """
        Returns a dict containing the current state of this :class:`FP16_Optimizer`
        instance: loss-scaler state, overflow flag, the wrapped optimizer's
        state_dict, and the fp32 master copies of the fp16 params.

        Example::

            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            torch.save(checkpoint, "saved.pth")
        """
        state_dict = {}
        state_dict['loss_scaler'] = self.loss_scaler
        state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
        state_dict['overflow'] = self.overflow
        state_dict[
            'first_closure_call_this_step'] = self.first_closure_call_this_step
        state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
        state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups
        return state_dict

    def load_state_dict(self, state_dict):
        """
        Loads a state_dict created by an earlier call to state_dict().

        If this instance was constructed from some ``init_optimizer`` whose
        parameters came from ``model``, the user is expected to call
        ``model.load_state_dict()`` before this method.

        Example::

            model = torch.nn.Linear(D_in, D_out).cuda().half()
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
            ...
            checkpoint = torch.load("saved.pth")
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        """
        # I think it should actually be ok to reload the optimizer before the model.
        self.loss_scaler = state_dict['loss_scaler']
        self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
        self.overflow = state_dict['overflow']
        self.first_closure_call_this_step = state_dict[
            'first_closure_call_this_step']
        self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        # At this point, the optimizer's references to the model's fp32 parameters
        # are up to date, but the fp32 master copies of the fp16 params are not.
        # Two options:
        # 1: Refresh the master params from the model's fp16 params
        #    (less storage, but incurs precision loss).
        # 2: Save and restore the fp32 master copies separately.
        # We choose option 2. As long as the current FP16_Optimizer was
        # constructed the same way as the one whose state_dict we are loading,
        # the same master params are guaranteed to exist, so we can copy_() in place.
        for current_group, saved_group in zip(self.fp32_from_fp16_groups,
                                              state_dict['fp32_from_fp16']):
            for current, saved in zip(current_group, saved_group):
                current.data.copy_(saved.data)

    def step(self, closure=None):  # could add clip option.
        """
        If no closure is supplied, :attr:`step` should be called after
        ``fp16_optimizer_obj.backward(loss)``. It updates the fp32 master
        copy of parameters with the wrapped optimizer, then copies the
        updated fp32 params into the original fp16 params.

        If a closure is supplied, :attr:`step` may be called without a prior
        call to :attr:`backward(loss)`; the closure should call
        :attr:`zero_grad()`, compute the loss, call :attr:`backward(loss)`,
        and return the loss, as with an ordinary Pytorch optimizer.

        .. warning::
            Currently, calling :attr:`step` with a closure is not compatible
            with dynamic loss scaling.
        """
        scale = self.loss_scaler.loss_scale
        self._update_scale(self.overflow)

        if self.overflow:
            # Gradients are invalid this iteration; skip the update entirely.
            self.maybe_print(
                'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}'
                .format(scale, self.loss_scale))
            return

        if closure is not None:
            retval = self._step_with_closure(closure)
        else:
            retval = self.optimizer.step()

        self._master_params_to_model_params()

        return retval

    def _step_with_closure(self, closure):

        def wrapped_closure():
            # helpful for debugging
            # print("Calling wrapped_closure, first_closure_call_this_step = {}"
            #       .format(self.first_closure_call_this_step))
            if self.first_closure_call_this_step:
                # We expect that the fp16 params are initially fresh on entering self.step(),
                # so _master_params_to_model_params() is unnecessary the first time wrapped_closure()
                # is called within self.optimizer.step().
                self.first_closure_call_this_step = False
            else:
                # If self.optimizer.step() internally calls wrapped_closure more than once,
                # it may update the fp32 params after each call. However, self.optimizer
                # doesn't know about the fp16 params at all. If the fp32 params get updated,
                # we can't rely on self.optimizer to refresh the fp16 params; do it manually:
                self._master_params_to_model_params()
            # Our API expects the user to give us ownership of the backward() call by
            # replacing all calls to loss.backward() with optimizer.backward(loss).
            # If the user properly calls optimizer.backward(loss) within "closure",
            # calling closure() here gives the fp32 master params fresh gradients,
            # so wrapped_closure only needs to call closure() and return the loss.
            temp_loss = closure()
            while (self.overflow):
                scale = self.loss_scaler.loss_scale
                self._update_scale(self.overflow)
                self.maybe_print(
                    'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, '
                    'reducing to {}'.format(scale, self.loss_scale))
                temp_loss = closure()
            return temp_loss

        retval = self.optimizer.step(wrapped_closure)

        self.first_closure_call_this_step = True

        return retval

    def backward(self, loss, update_master_grads=True, retain_graph=False):
        """
        :attr:`backward` performs the following conceptual steps:

        1. fp32_loss = loss.float()
        2. scaled_loss = fp32_loss*loss_scale
        3. scaled_loss.backward(), accumulating scaled gradients into the ``.grad`` attributes of the model's leaves.  # noqa
        4. fp16 grads are copied to the master params' fp32 ``.grad`` attributes.  # noqa
        5. Finally, master grads are divided by loss_scale.

        In this way, after :attr:`backward`, the master params have fresh
        gradients and :attr:`step` may be called.

        .. warning::
            Gradients found in a model's leaves after :attr:`backward` should
            not be regarded as valid in general, because they may have been
            scaled. Only the master gradients (see
            :attr:`inspect_master_grad_data`) are valid.

        Args:
            loss: The loss output by the user's model. loss may be either float or half; it is converted to fp32 internally before scaling, for additional overflow safety.  # noqa
            update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. Setting this to False delays the copy, which is useful to eliminate redundant fp16->fp32 grad copies when :attr:`backward` is called on multiple losses in one iteration; the user then becomes responsible for calling :attr:`update_master_grads` before :attr:`step`.  # noqa
            retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``.  # noqa

        Example::

            # More efficient way to handle multiple losses:
            optimizer.backward(loss1, update_master_grads=False)
            optimizer.backward(loss2, update_master_grads=False)
            optimizer.update_master_grads()
        """  # noqa
        # To consider: try multiple backward passes using retain_grad=True to find
        # a loss scale that works, then a final dummy backward pass with
        # retain_graph=False to tear down the graph. This would avoid discarding
        # the iteration, but probably wouldn't improve overall efficiency.
        self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
        if update_master_grads:
            self.update_master_grads()

    def update_master_grads(self):
        """
        Copy the ``.grad`` attribute from stored references to fp16 parameters
        to the ``.grad`` attribute of the fp32 master parameters that are
        directly updated by the optimizer. Only needs to be called explicitly
        if :attr:`backward` was called with ``update_master_grads=False``.
        """
        if self.dynamic_loss_scale:
            self._check_overflow()
            if self.overflow: return  # noqa
        self._model_grads_to_master_grads()
        self._downscale_master()

    def inspect_master_grad_data(self):
        """
        Return the fp32 master gradients with shapes corresponding to their
        associated model leaves (the fp16 leaves' own ``.grad`` attributes may
        be scaled and should not be trusted).

        Returns:
            List of lists (one list for each parameter group) of the
            ``.grad.data`` attributes of the fp32 master params belonging to
            that group, or None if in an overflow state.
        """
        if self.overflow:
            print(
                'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. '
                'Gradients are currently invalid (may be inf, nan, or stale). Returning None.'
            )
            return None
        else:
            # The optimizer owns only references to master params.
            master_grads_data = []
            for param_group in self.optimizer.param_groups:
                master_grads_this_group = []
                for param in param_group['params']:
                    if param.grad is not None:
                        master_grads_this_group.append(param.grad.data)
                    else:
                        master_grads_this_group.append(None)
                master_grads_data.append(master_grads_this_group)
            return master_grads_data

    # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
    def _get_loss_scale(self):
        return self.loss_scaler.loss_scale

    def _set_loss_scale(self, value):
        self.loss_scaler.cur_scale = value

    loss_scale = property(_get_loss_scale, _set_loss_scale)

    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
    def _get_state(self):
        return self.optimizer.state

    def _set_state(self, value):
        self.optimizer.state = value

    state = property(_get_state, _set_state)

    # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
    # (for example, to adjust the learning rate)
    def _get_param_groups(self):
        return self.optimizer.param_groups

    def _set_param_groups(self, value):
        self.optimizer.param_groups = value

    param_groups = property(_get_param_groups, _set_param_groups)

View File

@@ -0,0 +1,220 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Variable
from .. import mpu
class tofp16(nn.Module):
    """Module that casts its input to fp16.

    Equivalent to::

        def forward(self, input):
            return input.half()
    """

    def __init__(self):
        super().__init__()

    def forward(self, input):
        # Delegates the cast to the tensor itself.
        return input.half()
def BN_convert_float(module):
    """Recursively cast affine BatchNorm layers back to fp32.

    Utility function for network_to_half(); retained for legacy purposes.
    """
    is_affine_bn = (isinstance(module, torch.nn.modules.batchnorm._BatchNorm)
                    and module.affine is True)
    if is_affine_bn:
        module.float()
    for child in module.children():
        BN_convert_float(child)
    return module
def network_to_half(network):
    """Convert `network` to half precision in a batchnorm-safe way.

    Retained for legacy purposes. It is recommended to use FP16Model.
    """
    halved = BN_convert_float(network.half())
    return nn.Sequential(tofp16(), halved)
def convert_module(module, dtype):
    """Cast `module`'s immediate (non-recursive) floating-point parameters,
    their gradients, and buffers to `dtype`; non-float tensors are left alone."""
    for param in module.parameters(recurse=False):
        if param is None:
            continue
        if param.data.dtype.is_floating_point:
            param.data = param.data.to(dtype=dtype)
        grad = param._grad
        if grad is not None and grad.data.dtype.is_floating_point:
            grad.data = grad.data.to(dtype=dtype)

    for buf in module.buffers(recurse=False):
        if buf is not None and buf.data.dtype.is_floating_point:
            buf.data = buf.data.to(dtype=dtype)
def convert_network(network, dtype):
    """Cast every submodule of ``network`` to ``dtype``.

    Affine batch-norm layers are skipped so they stay in fp32 for
    numerical stability. Returns the mutated network.
    """
    for submodule in network.modules():
        skip = (isinstance(submodule, torch.nn.modules.batchnorm._BatchNorm)
                and submodule.affine is True)
        if skip:
            continue
        convert_module(submodule, dtype)
    return network
class FP16Model(nn.Module):
    """Convert model to half precision in a batchnorm-safe way.

    Wraps ``network`` after converting it with ``convert_network`` (which
    keeps affine batch norms in fp32) and casts incoming tensors to fp16.
    """

    def __init__(self, network):
        super(FP16Model, self).__init__()
        self.network = convert_network(network, dtype=torch.half)

    def forward(self, *inputs):
        half_inputs = tuple(tensor.half() for tensor in inputs)
        return self.network(*half_inputs)
def backwards_debug_hook(grad):
    """Gradient hook that aborts if a master parameter receives a gradient.

    Master (fp32) copies are updated by the optimizer from model (fp16)
    gradients; a direct backward into them indicates a wiring bug.

    Raises:
        RuntimeError: always, flagging the unexpected gradient.
    """
    # Fixed typo in the original message: "recieved" -> "received".
    raise RuntimeError(
        'master_params received a gradient in the backward pass!')
def prep_param_lists(model, flat_master=False):
    """
    Creates a list of FP32 master parameters for a given model, as in
    `Training Neural Networks with Mixed Precision: Real Examples`_.
    Args:
        model (torch.nn.Module): Existing Pytorch model
        flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. # noqa
    Returns:
        A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. # noqa
    Example::
        model_params, master_params = prep_param_lists(model)
    .. warning::
        Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. # noqa
    .. _`Training Neural Networks with Mixed Precision: Real Examples`:
        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
    """  # noqa
    # Only trainable parameters get master copies.
    model_params = [
        param for param in model.parameters() if param.requires_grad
    ]
    if flat_master:
        # Give the user some more useful error messages
        try:
            # flatten_dense_tensors returns a contiguous flat array.
            # http://pytorch.org/docs/master/_modules/torch/_utils.html
            master_params = _flatten_dense_tensors(
                [param.data for param in model_params]).float()
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are not intercepted; the original error is
            # re-raised after the hint either way.
            print(
                'Error in prep_param_lists: model may contain a mixture of parameters '
                'of different types. Use flat_master=False, or use FP16_Optimizer.'
            )
            raise
        master_params = torch.nn.Parameter(master_params)
        master_params.requires_grad = True
        # master_params.register_hook(backwards_debug_hook)
        if master_params.grad is None:
            # Pre-allocate a grad buffer so the optimizer can copy into it.
            master_params.grad = master_params.new(*master_params.size())
        return model_params, [master_params]
    else:
        # One detached fp32 clone per model parameter.
        master_params = [
            param.clone().float().detach() for param in model_params
        ]
        for param in master_params:
            param.requires_grad = True
        return model_params, master_params
def model_grads_to_master_grads(model_params,
                                master_params,
                                flat_master=False):
    """
    Copy model gradients to master gradients.
    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. # noqa
    """  # noqa
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        flat = _flatten_dense_tensors([p.grad.data for p in model_params])
        master_params[0].grad.data.copy_(flat)
        return
    for model_p, master_p in zip(model_params, master_params):
        if model_p.grad is None:
            # Mirror the absence of a gradient on the master side.
            master_p.grad = None
            continue
        if master_p.grad is None:
            # Lazily allocate a grad buffer matching the master param.
            master_p.grad = Variable(master_p.data.new(*master_p.data.size()))
        master_p.grad.data.copy_(model_p.grad.data)
def master_params_to_model_params(model_params,
                                  master_params,
                                  flat_master=False):
    """
    Copy master parameters to model parameters.
    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. # noqa
    """  # noqa
    if flat_master:
        # Slice the single flat master tensor back into per-param views.
        unflat = _unflatten_dense_tensors(master_params[0].data, model_params)
        for model_p, master_data in zip(model_params, unflat):
            model_p.data.copy_(master_data)
    else:
        for model_p, master_p in zip(model_params, master_params):
            model_p.data.copy_(master_p.data)
# Backward compatibility fixes
def to_python_float(t):
    """Return ``t`` as a Python scalar.

    Backward-compatibility shim: ``Tensor.item()`` is a relatively recent
    addition, so fall back to indexing for old tensor types.
    """
    return t.item() if hasattr(t, 'item') else t[0]
# Parsed torch version components, kept for legacy version-dependent code paths.
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
# Re-export the model-parallel-aware gradient clipper under the legacy name.
clip_grad_norm = mpu.clip_grad_norm

View File

@@ -0,0 +1,245 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .. import mpu
# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
    """Convert a zero-dim tensor (or 1-element sequence) to a Python number.

    item() is a recent addition, so this helps with backward compatibility.
    """
    return t.item() if hasattr(t, 'item') else t[0]
class LossScaler:
    """
    Class that manages a static loss scale. This class is intended to interact with
    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.
    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
    :class:`FP16_Optimizer`'s constructor.
    Args:
        scale (float, optional, default=1.0): The loss scale.
    """

    def __init__(self, scale=1):
        self.cur_scale = scale

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        # A static scale never reports overflow.
        return False

    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
        # Mirrors DynamicLossScaler's checker; static scaling never inspects values.
        return False

    def update_scale(self, overflow):
        # Static scale: nothing to adjust.
        pass

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        scale = self.loss_scale
        return tuple(scale * g for g in grad_in)

    def backward(self, loss, retain_graph=False):
        # Scale the loss so fp16 gradients do not underflow.
        (loss * self.loss_scale).backward(retain_graph=retain_graph)
class DynamicLossScaler:
    """
    Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
    :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
    operates, because the default options can be changed using the
    the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
    Loss scaling is designed to combat the problem of underflowing gradients encountered at long
    times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss
    scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
    occurred.
    :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
    If a certain number of iterations occur without overflowing gradients detected,
    :class:`DynamicLossScaler` increases the loss scale once more.
    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
    always using the highest loss scale possible without incurring overflow.
    Args:
        init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
        scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. # noqa
        scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. # noqa
    """  # noqa

    def __init__(self,
                 init_scale=2**32,
                 scale_factor=2.,
                 scale_window=1000,
                 min_scale=1,
                 delayed_shift=1,
                 consecutive_hysteresis=False):
        # Current loss scale; lowered on overflow, raised after
        # `scale_window` clean iterations.
        self.cur_scale = init_scale
        self.cur_iter = 0
        # Iteration index of the most recent overflow (-1 = never).
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        # Floor below which the scale is never reduced.
        self.min_scale = min_scale
        # Number of consecutive overflows tolerated before the scale is
        # actually lowered (hysteresis).
        self.delayed_shift = delayed_shift
        self.cur_hysteresis = delayed_shift
        self.consecutive_hysteresis = consecutive_hysteresis

    # `params` is a list / generator of torch.Variable
    def has_overflow_serial(self, params):
        # Scan this rank's gradients for inf/NaN.
        for p in params:
            if p.grad is not None and DynamicLossScaler._has_inf_or_nan(
                    p.grad.data):
                return True
        return False

    def has_overflow(self, params):
        overflow = self.has_overflow_serial(params)
        # Since each model parallel GPU carries only part of the model,
        # make sure overflow flag is synced across all the model parallel GPUs
        overflow_gpu = torch.cuda.ByteTensor([overflow])
        torch.distributed.all_reduce(
            overflow_gpu,
            op=torch.distributed.ReduceOp.MAX,
            group=mpu.get_model_parallel_group())
        overflow = overflow_gpu[0].item()
        return bool(overflow)

    # `x` is a torch.Tensor
    # NOTE(review): missing @staticmethod, but every call site uses
    # DynamicLossScaler._has_inf_or_nan(tensor), so `x` binds correctly.
    def _has_inf_or_nan(x):
        try:
            # if x is half, the .float() incurs an additional deep copy, but it's necessary if
            # Pytorch's .sum() creates a one-element tensor of the same type as x
            # (which is true for some recent version of pytorch).
            cpu_sum = float(x.float().sum())
            # More efficient version that can be used if .sum() returns a Python scalar
            # cpu_sum = float(x.sum())
        except RuntimeError as instance:
            # We want to check if inst is actually an overflow exception.
            # RuntimeError could come from a different error.
            # If so, we still want the exception to propagate.
            if 'value cannot be converted' not in instance.args[0]:
                raise
            return True
        else:
            # NaN is the only value for which x != x, so the final clause
            # covers NaN; the first two cover +/-inf.
            if cpu_sum == float(
                    'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
                return True
            return False

    # `overflow` is boolean indicating whether the gradient overflowed
    def update_scale(self, overflow):
        # Defensive defaults for instances restored from old checkpoints
        # that predate these attributes.
        if not hasattr(self, 'min_scale'):
            self.min_scale = 1
        if not hasattr(self, 'delayed_shift'):
            self.delayed_shift = 1
        if not hasattr(self, 'cur_hysteresis'):
            self.cur_hysteresis = 1
        if not hasattr(self, 'consecutive_hysteresis'):
            self.consecutive_hysteresis = True
        if overflow:
            # self.cur_scale /= self.scale_factor
            if self.delayed_shift == 1 or self.cur_hysteresis == 1:
                # Hysteresis exhausted: actually lower the scale, but
                # never below min_scale.
                self.cur_scale = max(self.cur_scale / self.scale_factor,
                                     self.min_scale)
            else:
                self.cur_hysteresis -= 1
            self.last_overflow_iter = self.cur_iter
        else:
            if self.consecutive_hysteresis:
                # Refill hysteresis budget on every clean iteration.
                self.cur_hysteresis = self.delayed_shift
            if (self.cur_iter
                    - self.last_overflow_iter) % self.scale_window == 0:
                if not self.consecutive_hysteresis:
                    self.cur_hysteresis = self.delayed_shift
                # A full window of clean iterations: try a larger scale.
                self.cur_scale *= self.scale_factor
        self.cur_iter += 1

    @property
    def loss_scale(self):
        # The scale currently applied to losses/gradients.
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        # Backward-hook helper: scale incoming gradients by the loss scale.
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss, retain_graph=False):
        # Scale the loss before backprop so fp16 gradients don't underflow.
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)
##############################################################
# Example usage below here -- assuming it's in a separate file
##############################################################
"""
TO-DO separate out into an example.
if __name__ == "__main__":
import torch
from torch.autograd import Variable
from dynamic_loss_scaler import DynamicLossScaler
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in), requires_grad=False)
y = Variable(torch.randn(N, D_out), requires_grad=False)
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
w2 = Variable(torch.randn(H, D_out), requires_grad=True)
parameters = [w1, w2]
learning_rate = 1e-6
optimizer = torch.optim.SGD(parameters, lr=learning_rate)
loss_scaler = DynamicLossScaler()
for t in range(500):
y_pred = x.mm(w1).clamp(min=0).mm(w2)
loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
# Run backprop
optimizer.zero_grad()
loss.backward()
# Check for overflow
has_overflow = DynamicLossScaler.has_overflow(parameters)
# If no overflow, unscale grad and update as usual
if not has_overflow:
for param in parameters:
param.grad.data.mul_(1. / loss_scaler.loss_scale)
optimizer.step()
# Otherwise, don't do anything -- ie, skip iteration
else:
print('OVERFLOW!')
# Update loss scale for next iteration
loss_scaler.update_scale(has_overflow)
"""

View File

@@ -0,0 +1,18 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .distributed import * # noqa
from .gpt2_modeling import (GPT2Model,
gpt2_get_params_for_weight_decay_optimization)
from .model import BertModel, get_params_for_weight_decay_optimization

View File

@@ -0,0 +1,122 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Variable
from torch.nn.modules import Module
from torch.nn.parallel.distributed import DistributedDataParallel as DDP
from .. import mpu
class PyTorchDistributedDataParallel(DDP):
    """Thin DDP wrapper whose checkpoints contain only the wrapped module.

    Delegating ``state_dict``/``load_state_dict`` to ``self.module`` keeps
    saved keys free of the ``module.`` prefix DDP would otherwise add.
    """

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        return self.module.state_dict(destination, prefix, keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)
class DistributedDataParallel(Module):
    """Hand-rolled data-parallel wrapper that all-reduces gradients across
    the data-parallel process group (model-parallel aware via ``mpu``).

    Unlike torch's DDP, reduction happens when ``self.allreduce_params`` is
    invoked (the trainer is expected to call it after backward).
    """

    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        # NOTE(review): dist._backend / dist.dist_backend are legacy
        # private APIs removed in newer torch releases — verify against
        # the torch version this is pinned to.
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
        self.module = module
        self.data_parallel_group = mpu.get_data_parallel_group()
        # NOTE(review): broadcast source is the *model-parallel rank*, not
        # a data-parallel source rank — presumably intentional so each
        # model-parallel shard broadcasts from the matching rank; confirm.
        src_rank = mpu.get_model_parallel_rank()
        # Sync initial parameters across the data-parallel replicas.
        for p in self.module.parameters():
            if torch.is_tensor(p):
                dist.broadcast(p, src_rank, group=self.data_parallel_group)

        def allreduce_params(reduce_after=True,
                             no_scale=False,
                             fp32_allreduce=False):
            # All-reduce gradients once per backward pass (guarded by the
            # needs_reduction flag set in forward()).
            if (self.needs_reduction):
                self.needs_reduction = False
                # Bucket parameters by tensor type so each bucket can be
                # flattened into a single contiguous all-reduce.
                buckets = {}
                for name, param in self.module.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = (param.data.type())
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print(
                            'WARNING: gloo dist backend for half parameters may be extremely slow. It is recommended to use the NCCL backend in this case.' # noqa
                        )  # noqa
                        self.warn_on_half = False
                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    if fp32_allreduce:
                        coalesced = coalesced.float()
                    # Average either before or after the all-reduce,
                    # depending on reduce_after (pre-division avoids fp16
                    # overflow; post-division preserves precision).
                    if not no_scale and not reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    dist.all_reduce(coalesced, group=self.data_parallel_group)
                    torch.cuda.synchronize()
                    if not no_scale and reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    # Copy the reduced flat buffer back into each grad.
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
        self.hook_handles = []
        self.hooks = []
        # NOTE(review): allreduce_hook is defined but never registered on
        # any parameter (no register_hook call), and hook_handles/hooks
        # stay empty — reduction only happens if the trainer explicitly
        # calls self.allreduce_params. Confirm this is intentional.
        for param in list(self.module.parameters()):

            def allreduce_hook(*unused):
                Variable._execution_engine.queue_callback(allreduce_params)
        # Expose the closure so the training loop can trigger reduction.
        self.allreduce_params = allreduce_params

    def forward(self, *inputs, **kwargs):
        # Arm the one-shot reduction for the upcoming backward pass.
        self.needs_reduction = True
        return self.module(*inputs, **kwargs)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        # Delegate so checkpoints contain only the wrapped module's keys.
        sd = self.module.state_dict(destination, prefix, keep_vars)
        return sd

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)

    '''
    def _sync_buffers(self):
        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)
    def train(self, mode=True):
        # Clear NCCL communicator and CUDA event cache of the default group ID,
        # These cache will be recreated at the later call. This is currently a
        # work-around for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()
        super(DistributedDataParallel, self).train(mode)
        self.module.train(mode)
    '''

View File

@@ -0,0 +1,122 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT-2 model."""
import torch
import torch.nn.functional as F
from .. import mpu
def init_method_normal(std=0.02):
    """Build an in-place initializer drawing from N(0.0, std).

    This is only used for embeddings. The transformer has its
    own initializer.
    """

    def initialize(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return initialize
class GPT2Model(torch.nn.Module):
    """GPT-2 Language model.
    The output of the forward method are the logits (parallel or
    serial depending on the `parallel_output` flag.
    """

    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 max_memory_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 relative_encoding=False):
        super(GPT2Model, self).__init__()
        # When True, forward returns logits still sharded across
        # model-parallel ranks; callers must gather themselves.
        self.parallel_output = parallel_output
        init_method = init_method_normal(std=0.02)
        # Word embeddings (parallel): vocab dimension sharded across ranks.
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)
        # Transformer
        self.transformer = mpu.GPT2ParallelTransformer(
            num_layers,
            hidden_size,
            num_attention_heads,
            max_sequence_length,
            max_memory_length,
            embedding_dropout_prob,
            attention_dropout_prob,
            output_dropout_prob,
            checkpoint_activations,
            checkpoint_num_layers,
            relative_encoding=relative_encoding)

    def forward(self, input_ids, position_ids, attention_mask, *mems):
        # Embeddings.
        words_embeddings = self.word_embeddings(input_ids)
        embeddings = words_embeddings
        # Transformer. `mems` are Transformer-XL-style cached states; the
        # transformer returns (output, *new_mems).
        transformer_output = self.transformer(embeddings, position_ids,
                                              attention_mask, *mems)
        logits, *hidden_layers = transformer_output
        # Parallel logits: weight tying — project hidden states back
        # through the (sharded) embedding matrix.
        logits_parallel = mpu.copy_to_model_parallel_region(logits)
        logits_parallel = F.linear(logits_parallel,
                                   self.word_embeddings.weight)
        if self.parallel_output:
            return (logits_parallel, *hidden_layers)
        return (mpu.gather_from_model_parallel_region(logits_parallel),
                *hidden_layers)
def gpt2_get_params_for_weight_decay_optimization(module):
    """Split parameters into weight-decay and no-decay optimizer groups.

    All LayerNorm parameters and every bias tensor are exempt from weight
    decay, matching standard transformer training practice. Returns
    ``(decay_group, no_decay_group)`` param-group dicts.
    """
    decay_group = {'params': []}
    no_decay_group = {'params': [], 'weight_decay': 0.0}
    for submodule in module.modules():
        if isinstance(submodule, (mpu.LayerNorm, torch.nn.LayerNorm)):
            no_decay_group['params'].extend(
                p for p in submodule._parameters.values() if p is not None)
        else:
            for name, p in submodule._parameters.items():
                if p is None:
                    continue
                target = no_decay_group if name == 'bias' else decay_group
                target['params'].append(p)
    return decay_group, no_decay_group

View File

@@ -0,0 +1,93 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for wrapping BertModel."""
import torch
from .modeling import (BertConfig, BertForMaskedLM, BertForPreTraining,
BertLayerNorm)
def get_params_for_weight_decay_optimization(module):
    """Split parameters into weight-decay and no-decay optimizer groups.

    LayerNorm parameters and all bias tensors are exempt from weight
    decay, per standard BERT training practice. Returns
    ``(decay_group, no_decay_group)`` param-group dicts.
    """
    decay_group = {'params': []}
    no_decay_group = {'params': [], 'weight_decay': 0.0}
    for submodule in module.modules():
        if isinstance(submodule, (BertLayerNorm, torch.nn.LayerNorm)):
            no_decay_group['params'].extend(
                p for p in submodule._parameters.values() if p is not None)
        else:
            for name, p in submodule._parameters.items():
                if p is None:
                    continue
                target = no_decay_group if name == 'bias' else decay_group
                target['params'].append(p)
    return decay_group, no_decay_group
class BertModel(torch.nn.Module):
    """Wrapper around :class:`BertForPreTraining` configured from an ``args``
    namespace.

    Either loads published pretrained weights (``args.pretrained_bert``) or
    builds a fresh model from a :class:`BertConfig` assembled from ``args``.
    """

    def __init__(self, args):
        super(BertModel, self).__init__()
        if args.pretrained_bert:
            # Load pretrained weights; the fp32_* flags keep numerically
            # sensitive submodules in fp32 under mixed-precision training.
            self.model = BertForPreTraining.from_pretrained(
                args.tokenizer_model_type,
                cache_dir=args.cache_dir,
                fp32_layernorm=args.fp32_layernorm,
                fp32_embedding=args.fp32_embedding,
                layernorm_epsilon=args.layernorm_epsilon)
        else:
            # Standard BERT convention: FFN width defaults to 4x hidden size.
            if args.intermediate_size is None:
                intermediate_size = 4 * args.hidden_size
            else:
                intermediate_size = args.intermediate_size
            self.config = BertConfig(
                args.tokenizer_num_tokens,
                hidden_size=args.hidden_size,
                num_hidden_layers=args.num_layers,
                num_attention_heads=args.num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_dropout_prob=args.hidden_dropout,
                attention_probs_dropout_prob=args.attention_dropout,
                max_position_embeddings=args.max_position_embeddings,
                type_vocab_size=args.tokenizer_num_type_tokens,
                fp32_layernorm=args.fp32_layernorm,
                fp32_embedding=args.fp32_embedding,
                fp32_tokentypes=args.fp32_tokentypes,
                layernorm_epsilon=args.layernorm_epsilon,
                deep_init=args.deep_init)
            self.model = BertForPreTraining(self.config)

    def forward(self,
                input_tokens,
                token_type_ids=None,
                attention_mask=None,
                checkpoint_activations=False):
        # Thin passthrough to the wrapped BertForPreTraining model.
        return self.model(
            input_tokens,
            token_type_ids,
            attention_mask,
            checkpoint_activations=checkpoint_activations)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        # Checkpoints contain only the wrapped model's parameters.
        return self.model.state_dict(
            destination=destination, prefix=prefix, keep_vars=keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        return self.model.load_state_dict(state_dict, strict=strict)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,37 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model parallel utility interface."""
from .cross_entropy import vocab_parallel_cross_entropy
from .data import broadcast_data
from .grads import clip_grad_norm
from .initialize import (destroy_model_parallel, get_data_parallel_group,
get_data_parallel_rank, get_data_parallel_world_size,
get_model_parallel_group, get_model_parallel_rank,
get_model_parallel_src_rank,
get_model_parallel_world_size,
initialize_model_parallel,
model_parallel_is_initialized)
from .layers import (ColumnParallelLinear, ParallelEmbedding,
RowParallelLinear, VocabParallelEmbedding)
from .mappings import (copy_to_model_parallel_region,
gather_from_model_parallel_region,
reduce_from_model_parallel_region,
scatter_to_model_parallel_region)
from .random import (checkpoint, get_cuda_rng_tracker,
model_parallel_cuda_manual_seed,
partition_activations_in_checkpoint)
from .transformer import (BertParallelSelfAttention,
BertParallelTransformerLayer,
GPT2ParallelTransformer, LayerNorm)

View File

@@ -0,0 +1,823 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer."""
import math
import torch
import torch.nn.init as init
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
from .initialize import get_model_parallel_world_size
from .layers import ColumnParallelLinear
from .layers import RowParallelLinear
from .mappings import gather_from_model_parallel_region
import deepspeed
from .random import checkpoint
from .random import get_cuda_rng_tracker
from .utils import divide
from .utils import split_tensor_along_last_dim
class PositionalEmbedding(torch.nn.Module):
    """Sinusoidal positional embedding (Transformer-XL style).

    Produces a ``[1 (or bsz), len(pos_seq), hidden_size]`` tensor where the
    first half of the channels are sines and the second half cosines.
    """

    def __init__(self, hidden_size):
        super(PositionalEmbedding, self).__init__()
        self.hidden_size = hidden_size
        # Geometric frequency ladder over even channel indices.
        exponents = torch.arange(0.0, hidden_size, 2.0) / hidden_size
        self.register_buffer('inv_freq', 1 / (10000 ** exponents))

    def forward(self, pos_seq, bsz=None):
        # Outer product gives a [seq_len, hidden_size/2] phase matrix.
        phases = torch.ger(pos_seq, self.inv_freq)
        pos_emb = torch.cat([phases.sin(), phases.cos()], dim=-1)
        pos_emb = pos_emb[None, :, :]
        if bsz is not None:
            pos_emb = pos_emb.expand(bsz, -1, -1)
        return pos_emb
class GPT2ParallelSelfAttention(torch.nn.Module):
"""Parallel self-attention layer for GPT2.
Self-attention layer takes input with size [b, s, h] where b is
the batch size, s is the sequence lenght, and h is the hidden size
and creates output of the same size.
Arguments:
hidden_size: total hidden size of the layer (h).
num_attention_heads: number of attention heads (n). Note that we
require n to be divisible by number of GPUs
used to parallelize the model. Also, we
require hidden size to be divisible by n.
dropout_prob: dropout probability for the attention scores.
init_method: weight initialization.
output_layer_init_method: output layer initialization. If None, use
`init_method`.
We use the following notation:
h: hidden_size
n: num_attention_heads
p: number of partitions
np: n/p
hp: h/p
hn: h/n
b: batch size
s: sequence length
"""
    def __init__(self, hidden_size, num_attention_heads,
                 attention_dropout_prob, output_dropout_prob,
                 init_method, output_layer_init_method=None, relative_encoding=False):
        """Build the model-parallel self-attention sublayer.

        See the class docstring for the h/n/p notation used below.
        """
        super(GPT2ParallelSelfAttention, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Per attention head and per partition values.
        world_size = get_model_parallel_world_size()
        self.hidden_size_per_partition = divide(hidden_size, world_size)  # hp
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)  # hn
        self.num_attention_heads_per_partition = divide(num_attention_heads,
                                                        world_size)  # np
        self.relative_encoding = relative_encoding
        # Strided linear layer: fused Q/K/V projection whose columns are
        # sharded across model-parallel ranks (gather_output=False).
        self.query_key_value = ColumnParallelLinear(hidden_size, 3 * hidden_size,
                                                    stride=3,
                                                    gather_output=False,
                                                    init_method=init_method)
        if relative_encoding:
            # Projection of relative position embeddings (Transformer-XL).
            self.relative = ColumnParallelLinear(hidden_size, hidden_size, gather_output=False,
                                                 init_method=init_method)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)
        # Output.
        self.dense = RowParallelLinear(hidden_size,
                                       hidden_size,
                                       input_is_parallel=True,
                                       init_method=output_layer_init_method)
        self.output_dropout = torch.nn.Dropout(output_dropout_prob)
        if deepspeed.checkpointing.is_configured():
            # Rebind the module-level checkpointing helpers so activation
            # checkpointing and RNG tracking route through DeepSpeed.
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint
def _transpose_for_scores(self, tensor):
"""Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
size [b, np, s, hn].
"""
new_tensor_shape = tensor.size()[:-1] + \
(self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head)
tensor = tensor.view(*new_tensor_shape)
return tensor.permute(0, 2, 1, 3)
@staticmethod
def _rel_shift(x, zero_triu=False):
# ql x kl x bsz x h
# bsz x h x ql x kl
zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1),
device=x.device, dtype=x.dtype)
x_padded = torch.cat([zero_pad, x], dim=-1)
x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2))
x = x_padded[:, :, 1:].view_as(x)
if zero_triu:
ones = torch.ones((x.size(0), x.size(1)))
x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None]
return x
@staticmethod
def _rel_shift_latest(x: torch.Tensor):
ndims = x.dim()
x_shape = x.size()
row_dim = 2
col_dim = row_dim + 1
assert col_dim < ndims
tgt_shape_1, tgt_shape_2 = [], []
for i in range(ndims):
if i == row_dim:
tgt_shape_1.append(x_shape[col_dim])
tgt_shape_2.append(x_shape[row_dim])
elif i == col_dim:
tgt_shape_1.append(x_shape[row_dim])
tgt_shape_2.append(x_shape[col_dim] - 1)
else:
tgt_shape_1.append(x_shape[i])
tgt_shape_2.append(x_shape[i])
x = x.view(*tgt_shape_1)
x = x[:, :, 1:, :]
x = x.view(*tgt_shape_2)
return x
    def forward(self, hidden_states, ltor_mask, position_embeddings=None, r_w_bias=None, r_r_bias=None, mem=None):
        """Masked (optionally memory-augmented) parallel self-attention.

        Args:
            hidden_states: input activations [b, s, h].
            ltor_mask: left-to-right mask [1, 1, s, s]; 1 keeps a position,
                0 masks it.
            position_embeddings: relative position embeddings, used only
                when self.relative_encoding is True.
            r_w_bias, r_r_bias: Transformer-XL content/position attention
                biases, used only with relative encoding.
            mem: optional cached hidden states from previous segments,
                prepended so keys/values cover memory + current segment.

        Returns:
            Attention output [b, s, h].
        """
        # hidden_states: [b, s, h]
        # ltor_mask: [1, 1, s, s]
        # Attention heads. [b, s, hp]
        query_length = hidden_states.size(1)
        if mem is None:
            mixed_x_layer = self.query_key_value(hidden_states)
            (mixed_query_layer,
             mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
        else:
            # Prepend memory so keys/values attend over cached states, then
            # restrict queries back to the current segment only.
            cat = torch.cat((mem, hidden_states), 1)
            mixed_x_layer = self.query_key_value(cat)
            (mixed_query_layer,
             mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
            mixed_query_layer = mixed_query_layer[:, -query_length:]
        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        if self.relative_encoding:
            relative_layer = self.relative(position_embeddings)
            relative_layer = self._transpose_for_scores(relative_layer)  # 1 (bsz) x n_head x klen x d_head
            # Raw attention scores. [b, np, qs, ks]
            # Transformer-XL decomposition: content term (AC) using the
            # r_w bias plus position term (BD) using the r_r bias.
            rw_head_q = query_layer + r_w_bias.unsqueeze(1)
            ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2))
            rr_head_q = query_layer + r_r_bias.unsqueeze(1)
            bd_score = torch.matmul(rr_head_q, relative_layer.transpose(-1, -2))
            bd_score = self._rel_shift(bd_score)  # qlen x klen x bsz x n_head
            # bd_score = bd_score.permute(2, 3, 0, 1) # bsz n_head qlen klen
            attention_scores = ac_score + bd_score
        else:
            # Raw attention scores. [b, np, s, s]
            attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(
            self.hidden_size_per_attention_head)
        # Apply the left to right attention mask: masked positions receive a
        # large negative score so they vanish after the softmax.
        attention_scores = torch.mul(attention_scores, ltor_mask) - \
            10000.0 * (1.0 - ltor_mask)
        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        # The RNG tracker keeps the dropout pattern consistent across model
        # parallel partitions.
        with get_cuda_rng_tracker().fork():
            attention_probs = self.attention_dropout(attention_probs)
        # Context layer.
        # [b, np, s, hn]
        context_layer = torch.matmul(attention_probs, value_layer)
        # [b, s, np, hn]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + \
            (self.hidden_size_per_partition,)
        # [b, s, hp]
        context_layer = context_layer.view(*new_context_layer_shape)
        # Output. [b, s, h]
        output = self.dense(context_layer)
        output = self.output_dropout(output)
        return output
@torch.jit.script
def gelu_impl(x):
    """OpenAI's tanh-approximation GELU, scripted with TorchScript.

    Computes 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
    with sqrt(2/pi) folded into the constant 0.7978845608028654.
    """
    inner = 0.7978845608028654 * x * (1.0 + 0.044715 * x * x)
    return 0.5 * x * (1.0 + torch.tanh(inner))
def gelu(x):
    """Plain-function wrapper around the TorchScript GELU implementation."""
    return gelu_impl(x)
class GPT2ParallelMLP(torch.nn.Module):
    """Position-wise feed-forward block for GPT2.

    Projects the hidden state h -> 4h with a column-parallel linear layer,
    applies gelu, projects back 4h -> h with a row-parallel linear layer,
    and finally applies dropout.

    Arguments:
        hidden_size: hidden size h of the transformer.
        output_dropout_prob: dropout probability applied to the output.
        init_method: weight initializer; biases start at zero and
                     layernorm weights at one.
        output_layer_init_method: initializer for the output projection;
                                  falls back to `init_method` when None.
    """

    def __init__(self, hidden_size, output_dropout_prob, init_method,
                 output_layer_init_method=None):
        super(GPT2ParallelMLP, self).__init__()
        if output_layer_init_method is None:
            # Reuse the input initializer for the output projection.
            output_layer_init_method = init_method
        # h -> 4h, split column-wise across model-parallel ranks; keep the
        # output partitioned so the next layer consumes it directly.
        self.dense_h_to_4h = ColumnParallelLinear(hidden_size, 4 * hidden_size,
                                                  gather_output=False,
                                                  init_method=init_method)
        # 4h -> h, split row-wise; accepts the partitioned activations.
        self.dense_4h_to_h = RowParallelLinear(
            4 * hidden_size,
            hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)
        self.dropout = torch.nn.Dropout(output_dropout_prob)

    def forward(self, hidden_states):
        # [b, s, h] -> [b, s, 4hp] -> gelu -> [b, s, h] -> dropout
        expanded = self.dense_h_to_4h(hidden_states)
        activated = gelu(expanded)
        projected = self.dense_4h_to_h(activated)
        return self.dropout(projected)
class GPT2ParallelTransformerLayer(torch.nn.Module):
    """A single layer transformer for GPT2.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    Transformer layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention head in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weight are initialized to one.
        output_layer_init_method: output layers (attention output and
                                  mlp output) initialization. If None,
                                  use `init_method`.
        relative_encoding: if True, the attention uses Transformer-XL
                           relative position encoding.
    """
    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None,
                 relative_encoding=False):
        super(GPT2ParallelTransformerLayer, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Layernorm on the input data.
        # NOTE(review): this uses torch.nn.LayerNorm while
        # post_attention_layernorm below uses the file's `LayerNorm` name
        # (possibly a fused implementation) — confirm the mix is intentional.
        self.input_layernorm = torch.nn.LayerNorm(hidden_size, eps=layernorm_epsilon)
        # Self attention.
        self.attention = GPT2ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method,
            relative_encoding=relative_encoding)
        # Layernorm after the self attention.
        self.post_attention_layernorm = LayerNorm(hidden_size,
                                                  eps=layernorm_epsilon)
        # MLP
        self.mlp = GPT2ParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)
    def forward(self, hidden_states, ltor_mask, position_embeddings=None, r_w_bias=None, r_r_bias=None, mem=None):
        """Pre-LN block: LN -> attention -> residual -> LN -> MLP -> residual."""
        # hidden_states: [b, s, h]
        # ltor_mask: [1, 1, s, s]
        # Layer norm at the begining of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        # The cached memory is normalized with the same input layernorm
        # before being handed to the attention.
        mem = self.input_layernorm(mem) if mem is not None else None
        # Self attention.
        attention_output = self.attention(layernorm_output, ltor_mask, position_embeddings, r_w_bias, r_r_bias, mem)
        # Residual connection.
        layernorm_input = hidden_states + attention_output
        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)
        # MLP.
        mlp_output = self.mlp(layernorm_output)
        # Second residual connection.
        output = layernorm_input + mlp_output
        return output
def unscaled_init_method(sigma):
    """Build a weight initializer drawing from N(0, sigma).

    Returns a callable suitable for the `init_method` arguments used
    throughout this file; it fills the given tensor in place.
    """
    def initialize(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
    return initialize
def scaled_init_method(sigma, num_layers):
    """Build a weight initializer drawing from N(0, sigma/sqrt(2*num_layers)).

    The 1/sqrt(2*num_layers) scaling (GPT-2 style) keeps residual-branch
    outputs from growing with depth. Returns a callable that fills the
    given tensor in place.
    """
    scaled_std = sigma / math.sqrt(2.0 * num_layers)
    def initialize(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=scaled_std)
    return initialize
class GPT2ParallelTransformer(torch.nn.Module):
    """GPT-2 transformer.

    This module takes input from embedding layer and it's output can
    be used directly by a logit layer. It consists of L (num-layers)
    blocks of:
        layer norm
        self attention
        residual connection
        layer norm
        mlp
        residual connection
    followed by a final layer norm.

    Arguments:
        num_layers: Number of transformer layers.
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention head in the self
                             attention.
        max_sequence_length: maximum sequence length; sizes the absolute
                             position embedding table.
        max_memory_length: number of cached token states kept per layer
                           for Transformer-XL style recurrence (0 disables
                           the memory).
        embedding_dropout_prob: dropout probability on the embeddings.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        checkpoint_activations: if True, checkpoint activations.
        checkpoint_num_layers: number of layers to checkpoint. This
                               is basically the chunk size in checkpointing.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method_std: standard deviation of the init method which has
                         the form N(0, std).
        use_scaled_init_for_output_weights: If True use 1/sqrt(2*num_layers)
                                            scaling for the output weights (
                                            output of self attention and mlp).
        relative_encoding: if True, use Transformer-XL relative position
                           encoding instead of learned absolute positions.
    """
    def __init__(self,
                 num_layers,
                 hidden_size,
                 num_attention_heads,
                 max_sequence_length,
                 max_memory_length,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 layernorm_epsilon=1.0e-5,
                 init_method_std=0.02,
                 use_scaled_init_for_output_weights=True,
                 relative_encoding=False):
        super(GPT2ParallelTransformer, self).__init__()
        # Store activation checkpointing flag.
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers
        self.max_memory_length = max_memory_length
        output_layer_init_method = None
        if use_scaled_init_for_output_weights:
            output_layer_init_method = scaled_init_method(init_method_std,
                                                          num_layers)
        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
        self.relative_encoding = relative_encoding
        if relative_encoding:
            # Relative position embedding
            self.position_embeddings = PositionalEmbedding(hidden_size)
            # Per attention head and per partition values.
            world_size = get_model_parallel_world_size()
            self.hidden_size_per_attention_head = divide(hidden_size,
                                                         num_attention_heads)
            self.num_attention_heads_per_partition = divide(num_attention_heads,
                                                            world_size)
            # Transformer-XL content (r_w) and position (r_r) biases,
            # shared across layers.
            self.r_w_bias = torch.nn.Parameter(
                torch.Tensor(self.num_attention_heads_per_partition, self.hidden_size_per_attention_head))
            self.r_w_bias.model_parallel = True
            self.r_r_bias = torch.nn.Parameter(
                torch.Tensor(self.num_attention_heads_per_partition, self.hidden_size_per_attention_head))
            self.r_r_bias.model_parallel = True
            # Always initialize bias to zero.
            with torch.no_grad():
                self.r_w_bias.zero_()
                self.r_r_bias.zero_()
        else:
            # Position embedding (serial).
            self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                          hidden_size)
            # Initialize the position embeddings.
            torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std)

        def get_layer():
            # One transformer block; all blocks share the same hyperparameters.
            return GPT2ParallelTransformerLayer(
                hidden_size,
                num_attention_heads,
                attention_dropout_prob,
                output_dropout_prob,
                layernorm_epsilon,
                unscaled_init_method(init_method_std),
                output_layer_init_method=output_layer_init_method,
                relative_encoding=relative_encoding)

        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [get_layer() for _ in range(num_layers)])
        # Final layer norm before output.
        self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def forward(self, hidden_states, position_ids, attention_mask, *mems):
        """Run the stack of transformer layers.

        Args:
            hidden_states: token embeddings [b, s, h].
            position_ids: absolute position ids (unused with relative
                encoding).
            attention_mask: attention mask; sliced to the attended window
                (memory + current segment) along its last dimension.
            *mems: optional per-layer cached hidden states [b, m, h].

        Returns:
            Tuple of (output [b, s, h], *updated per-layer memories) —
            just (output,) when max_memory_length == 0.
        """
        query_length = hidden_states.size(1)
        memory_length = mems[0].size(1) if mems else 0
        key_length = query_length + memory_length
        # Keep only the mask columns covering memory + current segment.
        attention_mask = attention_mask[:, :, :, -query_length - memory_length:]
        if self.relative_encoding:
            # FIX: embedding dropout was previously applied to hidden_states
            # twice in this branch (before and after computing the position
            # embeddings), doubling the effective dropout rate during
            # training; it is now applied exactly once, matching the
            # absolute-position branch below.
            # Relative positions run from key_length-1 down to 0 (floats,
            # as expected by the sinusoidal PositionalEmbedding).
            position_sequence = torch.arange(key_length - 1, -1, -1.0, device=hidden_states.device,
                                             dtype=hidden_states.dtype)
            position_embeddings = self.position_embeddings(position_sequence)
            # Apply dropout to both the position embeddings and the input.
            position_embeddings = self.embedding_dropout(position_embeddings)
            hidden_states = self.embedding_dropout(hidden_states)
        else:
            position_embeddings = self.position_embeddings(position_ids)
            hidden_states = hidden_states + position_embeddings
            hidden_states = self.embedding_dropout(hidden_states)
        if self.max_memory_length > 0:
            # The input embeddings are the first cached "layer".
            mem_layers = [hidden_states.detach()]
        else:
            mem_layers = []

        def custom(start, end):
            # Build a forward over layers [start, end) suitable for
            # activation checkpointing; extra inputs are unpacked according
            # to the encoding mode.
            def custom_forward(*inputs):
                layers_ = self.layers[start:end]
                x_, inputs = inputs[0], inputs[1:]
                if self.relative_encoding:
                    # [attention_mask, position_embeddings, r_w_bias, r_r_bias]
                    inputs, mems_ = inputs[:4], inputs[4:]
                else:
                    # [attention_mask]
                    inputs, mems_ = inputs[:1], inputs[1:]
                for i, layer in enumerate(layers_):
                    mem_i_ = mems_[i] if mems_ else None
                    x_ = layer(x_, *inputs, mem=mem_i_)
                    if self.max_memory_length > 0:
                        mem_layers.append(x_.detach())
                return x_
            return custom_forward

        if self.checkpoint_activations:
            # Process the stack in chunks of checkpoint_num_layers,
            # recomputing activations in backward.
            layer_index = 0
            num_layers = len(self.layers)
            chunk_length = self.checkpoint_num_layers
            while layer_index < num_layers:
                args = [hidden_states, attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                if mems:
                    args += mems[layer_index: layer_index + chunk_length]
                hidden_states = checkpoint(custom(layer_index, layer_index + chunk_length), *args)
                layer_index += chunk_length
        else:
            for i, layer in enumerate(self.layers):
                args = [hidden_states, attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                mem_i = mems[i] if mems else None
                hidden_states = layer(*args, mem=mem_i)
                if self.max_memory_length > 0:
                    mem_layers.append(hidden_states.detach())
        # Final layer norm.
        output = self.final_layernorm(hidden_states)
        if self.max_memory_length > 0:
            mem_layers = self.update_mems(mem_layers, mems)
        return (output, *mem_layers)

    def update_mems(self, hiddens, mems):
        """Slide the memory window forward by the current segment.

        Keeps at most max_memory_length cached positions per layer by
        concatenating the old memory with the new hidden states and
        truncating from the left. Runs under no_grad so memories never
        participate in backprop.
        """
        memory_length = mems[0].size(1) if mems else 0
        query_length = hiddens[0].size(1)
        new_memory_length = min(self.max_memory_length, memory_length + query_length)
        new_mems = []
        with torch.no_grad():
            for i in range(len(hiddens)):
                if new_memory_length <= query_length:
                    # The new segment alone fills the window.
                    new_mems.append(hiddens[i][:, -new_memory_length:])
                else:
                    # Keep the tail of the old memory plus the new segment.
                    new_mems.append(torch.cat((mems[i][:, -new_memory_length + query_length:], hiddens[i]), dim=1))
        return new_mems
class BertParallelSelfAttention(torch.nn.Module):
    """Parallel self-attention layer for BERT.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence length, and h is the hidden size
    and creates output of the same size.

    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by number of GPUs
                             used to parallelize the model. Also, we
                             require hidden size be divisible by n.
        dropout_prob: dropout probability for the attention scores.
        output_parallel: If true, no all-gather is done on the output and
                         the output values will be per partition.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    """
    def __init__(self, hidden_size, num_attention_heads,
                 dropout_prob, output_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelSelfAttention, self).__init__()
        # Input configuration.
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.dropout_prob = dropout_prob
        self.output_parallel = output_parallel
        # Per attention head and per partition values.
        world_size = get_model_parallel_world_size()
        self.hidden_size_per_partition = divide(hidden_size, world_size)
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)
        self.num_attention_heads_per_partition = divide(num_attention_heads,
                                                        world_size)
        # Strided linear layer producing fused [query, key, value].
        self.query_key_value = ColumnParallelLinear(hidden_size, 3 * hidden_size,
                                                    stride=3,
                                                    gather_output=False,
                                                    init_method=init_method)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.dropout = torch.nn.Dropout(dropout_prob)
        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint
    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """
        new_tensor_shape = tensor.size()[:-1] + \
            (self.num_attention_heads_per_partition,
             self.hidden_size_per_attention_head)
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)
    def forward(self, hidden_states, attention_mask):
        """Compute bidirectional self-attention.

        Args:
            hidden_states: input activations [b, s, h].
            attention_mask: additive mask broadcastable to [b, np, s, s]
                (masked positions carry large negative values).

        Returns:
            [b, s, h] output, or the per-partition [b, s, hp] slice when
            output_parallel is True.
        """
        # Attention heads. [b, s, hp]
        mixed_x_layer = self.query_key_value(hidden_states)
        (mixed_query_layer,
         mixed_key_layer,
         mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        # Raw attention scores. [b, np, s, s]
        # Dividing both q and k by sqrt(sqrt(hn)) gives the usual overall
        # 1/sqrt(hn) scaling while keeping intermediate values smaller.
        norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head))
        attention_scores = torch.matmul(query_layer / norm_factor,
                                        key_layer.transpose(-1, -2) / norm_factor)
        # Apply the attention mask (additive).
        attention_scores += attention_mask
        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        # The RNG tracker keeps the dropout pattern consistent across model
        # parallel partitions.
        with get_cuda_rng_tracker().fork():
            attention_probs = self.dropout(attention_probs)
        # Context layer.
        # [b, np, s, hn]
        context_layer = torch.matmul(attention_probs, value_layer)
        # [b, s, np, hn]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + \
            (self.hidden_size_per_partition,)
        # [b, s, hp]
        context_layer = context_layer.view(*new_context_layer_shape)
        # Output. [b, s, h]
        if self.output_parallel:
            output = context_layer
        else:
            output = gather_from_model_parallel_region(context_layer)
        return output
class BertParallelTransformerOutput(torch.nn.Module):
    """Dense -> dropout -> residual add -> layernorm block.

    Used after both the self-attention and the intermediate (MLP) parts of
    a BERT transformer layer.
    """

    def __init__(self, input_size, output_size, dropout_prob,
                 layernorm_epsilon=1.0e-12, input_is_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerOutput, self).__init__()
        # Row-parallel projection back to the output size.
        self.dense = RowParallelLinear(input_size,
                                       output_size,
                                       input_is_parallel=input_is_parallel,
                                       init_method=init_method)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon)

    def forward(self, hidden_states, input_tensor):
        projected = self.dense(hidden_states)
        projected = self.dropout(projected)
        # Residual connection followed by layer normalization.
        return self.layernorm(projected + input_tensor)
class BertParallelTransformerLayer(torch.nn.Module):
    """A single transformer layer for BERT.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    The layer takes input with size [b, s, h] and returns an output of
    the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        intermediate_size: size of the intermediate state after self
                           attention (4*h in both BERT and GPT).
        num_attention_heads: number of attention heads in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs after
                             self attention and final output.
        intermediate_activation_fn: activation function applied to the
                                    intermediate projection.
        layernorm_epsilon: epsilon used in layernorm to avoid division
                           by zero.
        init_method: initialization method used for the weights; biases
                     start at zero and layernorm weights at one.
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 intermediate_activation_fn,
                 layernorm_epsilon,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerLayer, self).__init__()
        # Parallel self-attention; keeps its output partitioned so the
        # following output block can consume it without an all-gather.
        self.attention = BertParallelSelfAttention(hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_parallel=True,
                                                   init_method=init_method)
        # Projection + dropout + residual + layernorm after attention.
        self.self_output = BertParallelTransformerOutput(
            hidden_size, hidden_size, output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method)
        # Column-parallel expansion to the intermediate size.
        self.intermediate = ColumnParallelLinear(hidden_size, intermediate_size,
                                                 gather_output=False,
                                                 init_method=init_method)
        self.intermediate_activation_fn = intermediate_activation_fn
        # Projection + dropout + residual + layernorm after the MLP.
        self.output = BertParallelTransformerOutput(
            intermediate_size, hidden_size, output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method)

    def forward(self, hidden_states, attention_mask):
        # Self attention; output still partition-parallel: [b, s, hp].
        attention_parallel = self.attention(hidden_states, attention_mask)
        # Back to [b, s, h] with residual + layernorm.
        attention_out = self.self_output(attention_parallel, hidden_states)
        # Feed-forward expansion and activation: [b, s, ip].
        intermediate_parallel = self.intermediate(attention_out)
        intermediate_parallel = self.intermediate_activation_fn(
            intermediate_parallel)
        # Final projection back to [b, s, h] with residual + layernorm.
        return self.output(intermediate_parallel, attention_out)

View File

@@ -0,0 +1,110 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .initialize import (get_model_parallel_group, get_model_parallel_rank,
get_model_parallel_world_size)
from .utils import VocabUtility
class _VocabParallelCrossEntropy(torch.autograd.Function):
    """Cross entropy over logits whose vocabulary dimension is split
    across model parallel ranks.

    Computes the loss without ever gathering the full vocabulary on one
    rank: max, sum-of-exponentials, and the target logit are each combined
    with an all-reduce across the model parallel group.
    """
    @staticmethod
    def forward(ctx, vocab_parallel_logits, target):
        """Compute per-token cross entropy.

        Args:
            vocab_parallel_logits: local logits [..., partition-vocab-size].
            target: indices into the full (unsplit) vocabulary, shaped like
                the leading dimensions of the logits.

        Returns:
            Per-token loss tensor with the same shape as `target`.
        """
        # Copy so the input remains unchanged.
        logits = vocab_parallel_logits.clone()
        # Maximum value along vocab dimension across all GPUs
        # (for numerical stability of the softmax).
        logits_max = torch.max(logits, dim=-1)[0]
        torch.distributed.all_reduce(
            logits_max,
            op=torch.distributed.ReduceOp.MAX,
            group=get_model_parallel_group())
        # Subtract the maximum value.
        logits.sub_(logits_max.unsqueeze(dim=-1))
        # Sum of exponential of logits along vocab dimension across all GPUs.
        exp_logits = logits.exp()
        sum_exp_logits = exp_logits.sum(dim=-1)
        torch.distributed.all_reduce(
            sum_exp_logits,
            op=torch.distributed.ReduceOp.SUM,
            group=get_model_parallel_group())
        # Get the partition's vocab indices.
        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
        partition_vocab_size = vocab_parallel_logits.size()[-1]
        rank = get_model_parallel_rank()
        world_size = get_model_parallel_world_size()
        vocab_start_index, vocab_end_index = get_vocab_range(
            partition_vocab_size, rank, world_size)
        # Create a mask of valid vocab ids (1 means it needs to be masked):
        # targets outside this rank's vocab slice contribute zero locally.
        target_mask = (target < vocab_start_index) | (
            target >= vocab_end_index)
        masked_target = target.clone() - vocab_start_index
        masked_target[target_mask] = 0
        # Get predicted-logits = logits[target].
        # For Simplicity, we convert logits to a 2-D tensor with size
        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
        logits_2d = logits.view(-1, partition_vocab_size)
        masked_target_1d = masked_target.view(-1)
        arange_1d = torch.arange(
            start=0, end=logits_2d.size()[0], device=logits_2d.device)
        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
        predicted_logits = predicted_logits_1d.view_as(target)
        predicted_logits[target_mask] = 0.0
        # All reduce is needed to get the chunks from other GPUs.
        torch.distributed.all_reduce(
            predicted_logits,
            op=torch.distributed.ReduceOp.SUM,
            group=get_model_parallel_group())
        # Loss = log(sum(exp(logits))) - predicted-logit.
        loss = torch.log(sum_exp_logits) - predicted_logits
        # Store softmax, target-mask and masked-target for backward pass.
        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
        return loss
    @staticmethod
    def backward(ctx, grad_output):
        """Gradient of the loss w.r.t. the local logits: softmax - one_hot,
        restricted to this rank's vocabulary slice."""
        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target_1d = ctx.saved_tensors
        # All the inputs have softmax as their gradient.
        grad_input = softmax
        # For simplicity, work with the 2D gradient.
        partition_vocab_size = softmax.size()[-1]
        grad_2d = grad_input.view(-1, partition_vocab_size)
        # Subtract 1 from the gradient of the target class, but only on the
        # rank that actually owns that vocab index.
        arange_1d = torch.arange(
            start=0, end=grad_2d.size()[0], device=grad_2d.device)
        grad_2d[arange_1d,
                masked_target_1d] -= (1.0 - target_mask.view(-1).float())
        # Finally elementwise multiplication with the output gradients.
        grad_input.mul_(grad_output.unsqueeze(dim=-1))
        return grad_input, None
def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
    """Helper function for the cross entropy.

    Args:
        vocab_parallel_logits: logits [..., partition-vocab-size] with the
            vocabulary dimension split across model parallel ranks.
        target: indices into the full (unsplit) vocabulary, with the same
            leading shape as the logits.

    Returns:
        Per-token cross entropy loss shaped like `target`.
    """
    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)

View File

@@ -0,0 +1,117 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .initialize import (get_model_parallel_group, get_model_parallel_rank,
get_model_parallel_src_rank)
_MAX_DATA_DIM = 4
def _check_data_types(keys, data, target_dtype):
    """Assert that every tensor in `data` selected by `keys` has dtype
    `target_dtype`."""
    for key in keys:
        actual_dtype = data[key].dtype
        assert actual_dtype == target_dtype, '{} has data type {} which '\
            'is different than {}'.format(key, actual_dtype, target_dtype)
def _build_key_size_numel_dictionaries(keys, data):
    """Build per-key tensor sizes on rank 0 and broadcast them.

    Packs each key's tensor shape into a fixed-width (max_dim) slot of a
    flat list, broadcasts it from the model-parallel source rank, then
    unpacks on every rank.

    Returns:
        (key_size, key_numel, total_numel): dict of shapes, dict of element
        counts, and the total element count across all keys.
    """
    max_dim = _MAX_DATA_DIM
    # One max_dim-wide slot of zeros per key; zero marks "no more dims".
    sizes = [0 for _ in range(max_dim) for _ in keys]
    # Pack the sizes on rank zero.
    if get_model_parallel_rank() == 0:
        offset = 0
        for key in keys:
            assert data[key].dim(
            ) < max_dim, 'you should increase MAX_DATA_DIM'
            size = data[key].size()
            for i, s in enumerate(size):
                sizes[i + offset] = s
            offset += max_dim
    # Move to GPU and broadcast.
    sizes_cuda = torch.cuda.LongTensor(sizes)
    torch.distributed.broadcast(
        sizes_cuda,
        get_model_parallel_src_rank(),
        group=get_model_parallel_group())
    # Move back to cpu and unpack.
    sizes_cpu = sizes_cuda.cpu()
    key_size = {}
    key_numel = {}
    total_numel = 0
    offset = 0
    for key in keys:
        i = 0
        size = []
        numel = 1
        # Read dims until the zero sentinel (a real dim of size 0 would
        # terminate early, so shapes with zero-size dims are unsupported).
        while sizes_cpu[offset + i] > 0:
            this_size = sizes_cpu[offset + i]
            size.append(this_size)
            numel *= this_size
            i += 1
        key_size[key] = size
        key_numel[key] = numel
        total_numel += numel
        offset += max_dim
    return key_size, key_numel, total_numel
def broadcast_data(keys, data, datatype):
    """Broadcast data from rank zero of each model parallel group to the
    members of the same model parallel group.

    Arguments:
        keys: list of keys in the data dictionary to be broadcasted
        data: data dictionary of string keys and cpu tensor values.
        datatype: torch data type of all tensors in data associated
                  with keys.

    Returns:
        Dict mapping each key to a GPU tensor of the broadcast data,
        reshaped to its original size.
    """
    # Build (key, size) and (key, number of elements) dictionaries along
    # with the total number of elements on all ranks.
    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(
        keys, data)
    # Pack on rank zero.
    if get_model_parallel_rank() == 0:
        # Check that all keys have the same data type.
        _check_data_types(keys, data, datatype)
        # Flatten the data associated with the keys into one buffer so a
        # single broadcast suffices.
        flatten_data = torch.cat(
            [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
    else:
        flatten_data = torch.empty(
            total_numel, device=torch.cuda.current_device(), dtype=datatype)
    # Broadcast
    torch.distributed.broadcast(
        flatten_data,
        get_model_parallel_src_rank(),
        group=get_model_parallel_group())
    # Unpack each key's slice back to its original shape (narrow returns
    # views into the flat buffer, so no extra copies are made).
    output = {}
    offset = 0
    for key in keys:
        size = key_size[key]
        numel = key_numel[key]
        output[key] = flatten_data.narrow(0, offset, numel).view(size)
        offset += numel
    return output

View File

@@ -0,0 +1,72 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import torch
from torch._six import inf
from .initialize import get_model_parallel_group, get_model_parallel_rank
def clip_grad_norm(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.

    Returns:
        Total norm of the parameters (viewed as a single vector),
        measured before clipping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    # Only parameters that actually received gradients participate.
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    max_norm = float(max_norm)
    norm_type = float(norm_type)
    if norm_type == inf:
        total_norm = max(p.grad.data.abs().max() for p in parameters)
        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
        # Take max across all GPUs.
        torch.distributed.all_reduce(
            total_norm_cuda,
            op=torch.distributed.ReduceOp.MAX,
            group=get_model_parallel_group())
        total_norm = total_norm_cuda[0].item()
    else:
        total_norm = 0
        for p in parameters:
            # Count replicated (non-model-parallel) parameters only once,
            # on rank 0, so the all-reduce below does not double count them.
            # NOTE(review): assumes every parameter carries a
            # `model_parallel` attribute (set elsewhere in this codebase);
            # confirm for externally created parameters.
            if p.model_parallel or (get_model_parallel_rank() == 0):
                param_norm = p.grad.data.norm(norm_type)
                total_norm += param_norm.item()**norm_type
        # Sum across all model parallel GPUs.
        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
        torch.distributed.all_reduce(
            total_norm_cuda,
            op=torch.distributed.ReduceOp.SUM,
            group=get_model_parallel_group())
        total_norm = total_norm_cuda[0].item()**(1. / norm_type)
    # The epsilon avoids division by zero when gradients vanish.
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for p in parameters:
            p.grad.data.mul_(clip_coef)
    return total_norm

View File

@@ -0,0 +1,130 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model and data parallel groups."""
import torch
from .utils import ensure_divisibility
# Model parallel group that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None
def initialize_model_parallel(model_parallel_size_):
    """
    Initialize model data parallel groups.

    Arguments:
        model_parallel_size: number of GPUs used to parallelize model.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model. The present function will
    create 4 model parallel groups and 2 data parallel groups as:
        4 model parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 data parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    if torch.distributed.get_rank() == 0:
        print('> initializing model parallel with size {}'.format(
            model_parallel_size_))
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size = torch.distributed.get_world_size()
    # Cap the requested size at the world size; world size must then be
    # divisible by the (possibly reduced) model parallel size.
    model_parallel_size = min(model_parallel_size_, world_size)
    ensure_divisibility(world_size, model_parallel_size)
    rank = torch.distributed.get_rank()
    # Build the data parallel groups.
    # NOTE: every rank must call new_group for every group, even groups it
    # does not belong to — this is required by torch.distributed.
    global _DATA_PARALLEL_GROUP
    assert _DATA_PARALLEL_GROUP is None, \
        'data parallel group is already initialized'
    for i in range(model_parallel_size):
        ranks = range(i, world_size, model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if i == (rank % model_parallel_size):
            _DATA_PARALLEL_GROUP = group
    # Build the model parallel groups (consecutive rank blocks).
    global _MODEL_PARALLEL_GROUP
    assert _MODEL_PARALLEL_GROUP is None, \
        'model parallel group is already initialized'
    for i in range(world_size // model_parallel_size):
        ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if i == (rank // model_parallel_size):
            _MODEL_PARALLEL_GROUP = group
def model_parallel_is_initialized():
    """Return True once both the model and the data parallel groups exist."""
    return (_MODEL_PARALLEL_GROUP is not None
            and _DATA_PARALLEL_GROUP is not None)
def get_model_parallel_group():
    """Get the model parallel group the caller rank belongs to.

    Raises AssertionError if initialize_model_parallel has not run yet.
    """
    assert _MODEL_PARALLEL_GROUP is not None, \
        'model parallel group is not initialized'
    return _MODEL_PARALLEL_GROUP
def get_data_parallel_group():
    """Get the data parallel group the caller rank belongs to.

    Raises AssertionError if initialize_model_parallel has not run yet.
    """
    assert _DATA_PARALLEL_GROUP is not None, \
        'data parallel group is not initialized'
    return _DATA_PARALLEL_GROUP
def get_model_parallel_world_size():
    """Return world size for the model parallel group."""
    return torch.distributed.get_world_size(group=get_model_parallel_group())
def get_model_parallel_rank():
    """Return my rank within the model parallel group."""
    return torch.distributed.get_rank(group=get_model_parallel_group())
def get_model_parallel_src_rank():
    """Calculate the global rank corresponding to local rank zero
    in the model parallel group.

    Model parallel groups are contiguous rank blocks, so the source is
    the current global rank rounded down to a multiple of the group size.
    """
    world_rank = torch.distributed.get_rank()
    group_size = get_model_parallel_world_size()
    return world_rank - (world_rank % group_size)
def get_data_parallel_world_size():
    """Return world size for the data parallel group."""
    return torch.distributed.get_world_size(group=get_data_parallel_group())
def get_data_parallel_rank():
    """Return my rank within the data parallel group."""
    return torch.distributed.get_rank(group=get_data_parallel_group())
def destroy_model_parallel():
    """Tear down the cached parallel state by clearing both groups.

    Note: this does not destroy the underlying process groups, it only
    drops this module's references so initialize can be called again.
    """
    global _MODEL_PARALLEL_GROUP, _DATA_PARALLEL_GROUP
    _MODEL_PARALLEL_GROUP = None
    _DATA_PARALLEL_GROUP = None

View File

@@ -0,0 +1,358 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import math
import torch
import torch.nn.functional as F
import torch.nn.init as init
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
from torch.nn.parameter import Parameter
from .initialize import get_model_parallel_rank, get_model_parallel_world_size
from .mappings import (copy_to_model_parallel_region,
gather_from_model_parallel_region,
reduce_from_model_parallel_region,
scatter_to_model_parallel_region)
from .random import get_cuda_rng_tracker
from .utils import VocabUtility, divide, split_tensor_along_last_dim
def _initialize_affine_weight(weight,
                              output_size,
                              input_size,
                              per_partition_size,
                              partition_dim,
                              init_method,
                              stride=1,
                              return_master_weight=False):
    """Initialize affine weight for model parallel.

    Build the master weight on all processes and scatter
    the relevant chunk.

    Args:
        weight: the local (already allocated) parameter to fill in place.
        output_size, input_size: dimensions of the full (master) weight.
        per_partition_size: extent of this rank's shard along partition_dim.
        partition_dim: dimension along which the master weight is split.
        init_method: callable applied to the master weight (and, in the
            single-process case, directly to ``weight``).
        stride: number of interleaved sub-blocks per partition (used by
            strided/QKV-style layers).
        return_master_weight: if True, also return the full master weight.

    Returns:
        The master weight when requested, otherwise None.
    """
    # If we only use 1 process for model parallelism, bypass scatter.
    world_size = get_model_parallel_world_size()
    if world_size == 1:
        init_method(weight)
        if return_master_weight:
            return weight
        return None
    # Initialize master weight on every rank (same init RNG stream assumed
    # on all ranks so the shards are consistent).
    master_weight = torch.empty(
        output_size, input_size, dtype=weight.dtype, requires_grad=False)
    init_method(master_weight)
    # Split into stride-sized chunks; each rank takes every world_size-th
    # chunk starting at its own rank, then concatenates them into `weight`.
    per_partition_per_stride_size = divide(per_partition_size, stride)
    weight_list = torch.split(
        master_weight, per_partition_per_stride_size, dim=partition_dim)
    rank = get_model_parallel_rank()
    my_weight_list = weight_list[rank::world_size]
    with torch.no_grad():
        torch.cat(my_weight_list, dim=partition_dim, out=weight)
    if return_master_weight:
        return master_weight
    return None
class VocabParallelEmbedding(torch.nn.Module):
    """Embedding parallelized in the vocabulary dimension.

    This is mainly adapted from torch.nn.Embedding and all the default
    values are kept.
    Arguments:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        init_method: method to initialize weights.

    Each rank holds a contiguous slice of the vocabulary; out-of-slice
    tokens produce zero vectors locally and the final all-reduce sums the
    partial lookups across the model parallel group.
    """
    def __init__(self,
                 num_embeddings,
                 embedding_dim,
                 init_method=init.xavier_normal_):
        super(VocabParallelEmbedding, self).__init__()
        # Keep the input dimensions.
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Set the defaults for compatibility with torch.nn.Embedding.
        self.padding_idx = None
        self.max_norm = None
        self.norm_type = 2.
        self.scale_grad_by_freq = False
        self.sparse = False
        self._weight = None
        # Divide the weight matrix along the vocabulary dimension:
        # [vocab_start_index, vocab_end_index) is this rank's slice.
        self.vocab_start_index, self.vocab_end_index = \
            VocabUtility.vocab_range_from_global_vocab_size(
                self.num_embeddings, get_model_parallel_rank(),
                get_model_parallel_world_size())
        self.num_embeddings_per_partition = self.vocab_end_index - \
            self.vocab_start_index  # noqa
        # Allocate weights for the local vocabulary shard only.
        self.weight = Parameter(
            torch.Tensor(self.num_embeddings_per_partition,
                         self.embedding_dim))
        # Mark as model-parallel so grad-norm/optimizer code treats the
        # shard correctly.
        self.weight.model_parallel = True
        # And initialize.
        _initialize_affine_weight(self.weight, self.num_embeddings,
                                  self.embedding_dim,
                                  self.num_embeddings_per_partition, 0,
                                  init_method)
    def forward(self, input_):
        # Build the mask of tokens outside this rank's vocabulary slice.
        input_mask = (input_ < self.vocab_start_index) | \
                     (input_ >= self.vocab_end_index)
        # Shift token ids into local coordinates; masked ids are clamped
        # to 0 so the lookup is valid.
        masked_input = input_.clone() - self.vocab_start_index
        masked_input[input_mask] = 0
        # Get the embeddings from the local shard.
        output_parallel = F.embedding(masked_input, self.weight,
                                      self.padding_idx, self.max_norm,
                                      self.norm_type, self.scale_grad_by_freq,
                                      self.sparse)
        # Zero out rows for out-of-slice tokens so they do not pollute the
        # cross-rank sum.
        output_parallel[input_mask, :] = 0.0
        # Reduce across all the model parallel GPUs: exactly one rank
        # contributed a nonzero row per token.
        output = reduce_from_model_parallel_region(output_parallel)
        return output
class ParallelEmbedding(torch.nn.Module):
    """Embedding parallelized in the embedding dimension.

    This is mainly adapted from torch.nn.Embedding and all the default
    values are kept.
    Arguments:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        init_method: method to initialize weights.

    Each rank holds all vocabulary rows but only a slice of the hidden
    dimension; forward gathers the slices back along the last dimension.
    """
    def __init__(self,
                 num_embeddings,
                 embedding_dim,
                 init_method=init.xavier_normal_,
                 keep_master_weight_for_test=False):
        super(ParallelEmbedding, self).__init__()
        # Keep the input dimensions.
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Set some defaults for compatibility with torch.nn.Embedding.
        self.padding_idx = None
        self.max_norm = None
        self.norm_type = 2.
        self.scale_grad_by_freq = False
        self.sparse = False
        self._weight = None
        # Divide the weight matrix along the embedding dimension.
        world_size = get_model_parallel_world_size()
        self.embedding_dim_per_partition = divide(self.embedding_dim,
                                                  world_size)
        # Allocate weights for the local hidden-dim shard.
        self.weight = Parameter(
            torch.Tensor(self.num_embeddings,
                         self.embedding_dim_per_partition))
        self.weight.model_parallel = True
        # And initialize (partition_dim=1: split along the hidden dim).
        _initialize_affine_weight(
            self.weight,
            self.num_embeddings,
            self.embedding_dim,
            self.embedding_dim_per_partition,
            1,
            init_method,
            stride=1,
            return_master_weight=False)
    def forward(self, input_):
        # Identity forward / all-reduce backward so every rank sees the
        # full gradient of the token ids' consumers.
        input_parallel = copy_to_model_parallel_region(input_)
        output_parallel = F.embedding(input_parallel, self.weight,
                                      self.padding_idx, self.max_norm,
                                      self.norm_type, self.scale_grad_by_freq,
                                      self.sparse)
        # Concatenate the hidden-dim slices from all ranks.
        output = gather_from_model_parallel_region(output_parallel)
        return output
class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].
    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
    """
    def __init__(self,
                 input_size,
                 output_size,
                 bias=True,
                 gather_output=True,
                 init_method=init.xavier_normal_,
                 stride=1,
                 keep_master_weight_for_test=False):
        super(ColumnParallelLinear, self).__init__()
        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.gather_output = gather_output
        # Divide the weight matrix along the last dimension.
        world_size = get_model_parallel_world_size()
        self.output_size_per_partition = divide(output_size, world_size)
        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        self.weight = Parameter(
            torch.Tensor(self.output_size_per_partition, self.input_size))
        self.weight.model_parallel = True
        if bias:
            # Bias is sharded along the output dimension like the weight.
            self.bias = Parameter(torch.Tensor(self.output_size_per_partition))
            self.bias.model_parallel = True
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)
        # Initialize weight (partition_dim=0 because of the transpose).
        self.master_weight = _initialize_affine_weight(
            self.weight,
            self.output_size,
            self.input_size,
            self.output_size_per_partition,
            0,
            init_method,
            stride=stride,
            return_master_weight=keep_master_weight_for_test)
    def forward(self, input_):
        # Set up backprop all-reduce (identity forward, all-reduce backward).
        input_parallel = copy_to_model_parallel_region(input_)
        # Matrix multiply against the local output-column shard.
        output_parallel = F.linear(input_parallel, self.weight, self.bias)
        if self.gather_output:
            # All-gather across the partitions.
            output = gather_from_model_parallel_region(output_parallel)
        else:
            output = output_parallel
        return output
class RowParallelLinear(torch.nn.Module):
    """Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -
    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
    """
    def __init__(self,
                 input_size,
                 output_size,
                 bias=True,
                 input_is_parallel=False,
                 init_method=init.xavier_normal_,
                 stride=1,
                 keep_master_weight_for_test=False):
        super(RowParallelLinear, self).__init__()
        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
        # Divide the weight matrix along the last dimension.
        world_size = get_model_parallel_world_size()
        self.input_size_per_partition = divide(input_size, world_size)
        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        self.weight = Parameter(
            torch.Tensor(self.output_size, self.input_size_per_partition))
        self.weight.model_parallel = True
        if bias:
            # Bias is replicated (not model_parallel): it is added after the
            # cross-rank reduction.
            self.bias = Parameter(torch.Tensor(self.output_size))
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)
        # Initialize weight (partition_dim=1 because of the transpose).
        self.master_weight = _initialize_affine_weight(
            self.weight,
            self.output_size,
            self.input_size,
            self.input_size_per_partition,
            1,
            init_method,
            stride=stride,
            return_master_weight=keep_master_weight_for_test)
    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            input_parallel = input_
        else:
            # Split the input along its last dimension across ranks.
            input_parallel = scatter_to_model_parallel_region(input_)
        # Matrix multiply on the local input-row shard (bias added later,
        # after reduction, so it is only applied once).
        output_parallel = F.linear(input_parallel, self.weight)
        # All-reduce across all the partitions to sum partial products.
        output_ = reduce_from_model_parallel_region(output_parallel)
        if self.bias is not None:
            output = output_ + self.bias
        else:
            output = output_
        return output

View File

@@ -0,0 +1,144 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .initialize import get_model_parallel_group
from .utils import split_tensor_along_last_dim
def _reduce(input_):
    """All-reduce (sum, in place) the input tensor across the model
    parallel group and return it."""
    group = get_model_parallel_group()
    # Bypass the function if we are using only 1 GPU.
    if torch.distributed.get_world_size(group=group) == 1:
        return input_
    # All-reduce. Note: mutates input_ in place.
    torch.distributed.all_reduce(input_, group=group)
    return input_
def _split(input_):
    """Split the tensor along its last dimension and keep the
    slice corresponding to this rank."""
    group = get_model_parallel_group()
    # Bypass the function if we are using only 1 GPU.
    if torch.distributed.get_world_size(group=group) == 1:
        return input_
    # Split along last dimension into world_size equal chunks.
    world_size = torch.distributed.get_world_size(group=group)
    input_list = split_tensor_along_last_dim(input_, world_size)
    # Note: torch.split does not create contiguous tensors by default.
    rank = torch.distributed.get_rank(group=group)
    output = input_list[rank].contiguous()
    return output
def _gather(input_):
    """Gather tensors from all ranks and concatenate along the last
    dimension."""
    group = get_model_parallel_group()
    # Bypass the function if we are using only 1 GPU.
    if torch.distributed.get_world_size(group=group) == 1:
        return input_
    # Size and dimension.
    last_dim = input_.dim() - 1
    rank = torch.distributed.get_rank(group=group)
    world_size = torch.distributed.get_world_size(group=group)
    # Pre-place our own tensor so the local slot keeps autograd identity.
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    tensor_list[rank] = input_
    torch.distributed.all_gather(tensor_list, input_, group=group)
    # Note: torch.cat already creates a contiguous tensor.
    output = torch.cat(tensor_list, dim=last_dim).contiguous()
    return output
class _CopyToModelParallelRegion(torch.autograd.Function):
    """Pass the input to the model parallel region.

    Forward: identity. Backward: all-reduce (sum) the gradient so every
    rank sees the combined gradient of its replicated input.
    """
    @staticmethod
    def forward(ctx, input_):
        return input_
    @staticmethod
    def backward(ctx, grad_output):
        return _reduce(grad_output)
class _ReduceFromModelParallelRegion(torch.autograd.Function):
    """All-reduce the input from the model parallel region.

    Forward: all-reduce (sum). Backward: identity — the summed output's
    gradient is valid as-is on every rank.
    """
    @staticmethod
    def forward(ctx, input_):
        return _reduce(input_)
    @staticmethod
    def backward(ctx, grad_output):
        return grad_output
class _ScatterToModelParallelRegion(torch.autograd.Function):
    """Split the input and keep only the chunk corresponding to this rank.

    Forward: split along the last dimension. Backward: all-gather the
    gradient chunks back into the full gradient.
    """
    @staticmethod
    def forward(ctx, input_):
        return _split(input_)
    @staticmethod
    def backward(ctx, grad_output):
        return _gather(grad_output)
class _GatherFromModelParallelRegion(torch.autograd.Function):
    """Gather the input from model parallel region and concatenate.

    Forward: all-gather along the last dimension. Backward: split the
    gradient and keep this rank's slice.
    """
    @staticmethod
    def forward(ctx, input_):
        return _gather(input_)
    @staticmethod
    def backward(ctx, grad_output):
        return _split(grad_output)
# -----------------
# Helper functions.
# -----------------
def copy_to_model_parallel_region(input_):
    # Identity in forward; all-reduces the gradient in backward.
    return _CopyToModelParallelRegion.apply(input_)
def reduce_from_model_parallel_region(input_):
    # All-reduce in forward; identity gradient in backward.
    return _ReduceFromModelParallelRegion.apply(input_)
def scatter_to_model_parallel_region(input_):
    # Split along last dim in forward; all-gather gradient in backward.
    return _ScatterToModelParallelRegion.apply(input_)
def gather_from_model_parallel_region(input_):
    # All-gather along last dim in forward; split gradient in backward.
    return _GatherFromModelParallelRegion.apply(input_)

View File

@@ -0,0 +1,404 @@
# Modified by Samyam Rajbhandari
# Used to partition the activations stored for backward propagation
# Therefore reduces the memory consumption
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import contextlib
import torch
import torch.distributed as dist
from torch import _C
from torch.cuda import _lazy_call
from torch.cuda import device as device_ctx_manager
from .initialize import (get_data_parallel_rank, get_model_parallel_group,
get_model_parallel_rank,
get_model_parallel_world_size)
# Global switches for activation partitioning during checkpointing; toggled
# at runtime via partition_activations_in_checkpoint().
PARTITION_ACTIVATIONS = False
PA_CORRECTNESS_TEST = False
def see_memory_usage(message, force=False):
    """Print CUDA memory statistics on rank 0 (debug helper).

    No-op unless force=True. Performs a dist.barrier() on every rank, so
    all ranks must call it even though only rank 0 prints.
    NOTE(review): torch.cuda.memory_cached/max_memory_cached are deprecated
    aliases of memory_reserved/max_memory_reserved in newer PyTorch —
    consider migrating; left unchanged here.
    """
    if not force:
        return
    dist.barrier()
    if dist.get_rank() == 0:
        print(message)
        print('Memory Allocated ',
              torch.cuda.memory_allocated() / (1024 * 1024 * 1024),
              'GigaBytes')
        print('Max Memory Allocated ',
              torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),
              'GigaBytes')
        print('Cache Allocated ',
              torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes')
        print('Max cache Allocated ',
              torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
              'GigaBytes')
        print(' ')
# Model-parallel topology caches; None until lazily resolved on the first
# CheckpointFunction.forward call.
mp_rank = None  # get_model_parallel_rank()
mp_size = None  # get_model_parallel_world_size()
mp_group = None  # get_model_parallel_group()
# Default name for the model parallel rng tracker.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
# CUDA stream used to overlap activation all-gather with backward compute,
# and the device it lives on; both set lazily in CheckpointFunction.forward.
transport_stream = None
cuda_device = None
def detach_variable(inputs, device=None):
    """Return a tuple mirroring *inputs* with every tensor detached.

    Non-tensor items are passed through untouched. Detached tensors keep
    their original requires_grad flag (so they become fresh autograd
    leaves) and are optionally moved to *device* first.

    Raises RuntimeError if *inputs* is not a tuple.
    """
    if not isinstance(inputs, tuple):
        raise RuntimeError(
            'Only tuple of tensors is supported. Got Unsupported input type: ',
            type(inputs).__name__)
    detached = []
    for item in inputs:
        if not isinstance(item, torch.Tensor):
            detached.append(item)
        else:
            keep_grad = item.requires_grad
            moved = item.to(device=device) if device is not None else item
            leaf = moved.detach()
            leaf.requires_grad = keep_grad
            detached.append(leaf)
    return tuple(detached)
def _set_cuda_rng_state(new_state, device=-1):
    """Sets the random number generator state of the current GPU.

    Arguments:
        new_state (torch.ByteTensor): The desired state
        device: CUDA device index, device string, or -1 for the current
            device.
    This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
    with a single change: the input state is not cloned. Cloning caused
    major performance issues for +4 GPU cases.
    """
    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
        # older PyTorch exposes a private C binding that sets the state of
        # the device that is current inside the context manager.
        def cb():
            with device_ctx_manager(device):
                _C._cuda_setRNGState(new_state)
    else:
        # newer PyTorch: set the state on the per-device default generator.
        if device == -1:
            device = torch.device('cuda')
        elif isinstance(device, str):
            device = torch.device(device)
        elif isinstance(device, int):
            device = torch.device('cuda', device)
        def cb():
            idx = device.index
            if idx is None:
                idx = torch.cuda.current_device()
            default_generator = torch.cuda.default_generators[idx]
            default_generator.set_state(new_state)
    # Defer until CUDA is initialized (mirrors torch.cuda.set_rng_state).
    _lazy_call(cb)
class CudaRNGStatesTracker:
    """Tracker for the cuda RNG states.

    Using the `add` method, a cuda rng state is initialized based on
    the input `seed` and is assigned to `name`. Later, by forking the
    rng state, we can perform operations and return to our starting
    cuda state.
    """
    def __init__(self):
        # Map from a string name to the saved cuda rng state.
        self.states_ = {}
        # Seeds already used; guards against accidentally seeding twice.
        self.seeds_ = set()
    def reset(self):
        """Drop all tracked states and seeds (back to the initial state)."""
        self.states_ = {}
        self.seeds_ = set()
    def get_states(self):
        """Return a shallow copy of the name -> state mapping, so callers
        hold direct pointers to the states rather than to our dict."""
        return {name: state for name, state in self.states_.items()}
    def set_states(self, states):
        """Replace the tracked states wholesale. For efficiency, no
        compatibility checking is performed."""
        self.states_ = states
    def add(self, name, seed):
        """Register a new named cuda rng state seeded with *seed*."""
        # Reject duplicate seeds and duplicate names up front.
        if seed in self.seeds_:
            raise Exception('seed {} already exists'.format(seed))
        self.seeds_.add(seed)
        if name in self.states_:
            raise Exception('cuda rng state {} already exists'.format(name))
        # Seed, capture the resulting state, then restore the original
        # generator state so `add` is side-effect free.
        backup_state = torch.cuda.get_rng_state()
        torch.cuda.manual_seed(seed)
        self.states_[name] = torch.cuda.get_rng_state()
        _set_cuda_rng_state(backup_state)
    @contextlib.contextmanager
    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
        """Temporarily switch to the named cuda rng state; on exit, save
        the advanced state back under *name* and restore the original."""
        if name not in self.states_:
            raise Exception('cuda rng state {} is not added'.format(name))
        entry_state = torch.cuda.get_rng_state()
        _set_cuda_rng_state(self.states_[name])
        try:
            yield
        finally:
            # Persist the (advanced) forked state for the next fork, then
            # return to the state we entered with.
            self.states_[name] = torch.cuda.get_rng_state()
            _set_cuda_rng_state(entry_state)
# RNG tracker object (module-level singleton; access via get_cuda_rng_tracker).
_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
def get_cuda_rng_tracker():
    """Get the module-level cuda rng tracker singleton."""
    return _CUDA_RNG_STATE_TRACKER
def model_parallel_cuda_manual_seed(seed):
    """Initialize model parallel cuda seed.

    This function should be called after the model parallel is
    initialized. Also, no torch.cuda.manual_seed should be called
    after this function. Basically, this is replacement for that
    function.
    Two set of RNG states are tracked:
        default state: This is for data parallelism and is the same among a
                       set of model parallel GPUs but different across
                       different model parallel groups. This is used for
                       example for dropout in the non-model-parallel regions.
        model-parallel state: This state is different among a set of model
                              parallel GPUs, but the same across data parallel
                              groups. This is used for example for dropout in
                              model parallel regions.
    """
    # 2718 is just for fun and any POSITIVE value will work.
    offset = seed + 2718
    # Offset by model parallel rank so each shard draws distinct randomness.
    model_parallel_seed = offset + get_model_parallel_rank()
    # Data parallel gets the original seed.
    data_parallel_seed = seed
    if torch.distributed.get_rank() == 0:
        print(
            '> initializing model parallel cuda seeds on global rank {}, '
            'model parallel rank {}, and data parallel rank {} with '
            'model parallel seed: {} and data parallel seed: {}'.format(
                torch.distributed.get_rank(), get_model_parallel_rank(),
                get_data_parallel_rank(), model_parallel_seed,
                data_parallel_seed),
            flush=True)
    _CUDA_RNG_STATE_TRACKER.reset()
    # Set the default state.
    torch.cuda.manual_seed(data_parallel_seed)
    # and model parallel state.
    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
                                model_parallel_seed)
def get_partition_start(item):
    """Return the flat offset of this rank's activation partition of *item*.

    Requires the module-level mp_* globals to have been resolved (done
    lazily in CheckpointFunction.forward).
    """
    global mp_rank, mp_size, mp_group
    partition_size = get_partition_size(item)
    start = partition_size * mp_rank
    return int(start)
def get_partition_size(item):
    """Return the per-rank partition length of *item*'s flattened storage.

    Note: assumes item.numel() is divisible by mp_size; otherwise the
    int() truncation silently drops trailing elements.
    """
    global mp_rank, mp_size, mp_group
    size = item.numel()
    partition_size = size / mp_size
    return int(partition_size)
def get_full_inputs(tensors):
    """Reassemble full activation tensors from their per-rank partitions.

    *tensors* is the flat sequence saved by CheckpointFunction.forward when
    PARTITION_ACTIVATIONS is on: alternating (partition, size) pairs,
    followed by the final non-partitioned argument. Each partition is
    all-gathered across the model parallel group, reshaped to its original
    size, and written back into the saved tensor's .data in place.

    Returns the tuple of reconstructed inputs (including the trailing,
    never-partitioned argument tensors[-2]).

    Fix over the original: the inner gather loop reused the outer index
    name `i`, shadowing it; it only worked because Python's `for` rebinds
    the index each iteration. The inner index is now a distinct name.
    """
    inputs = []
    # Every saved entry except the trailing pair is a (partition, size) pair.
    for i in range(len(tensors) // 2 - 1):
        item = tensors[2 * i]
        size = tensors[2 * i + 1]
        partition_size = item.numel()
        tensor_size = partition_size * mp_size
        flat_tensor = torch.zeros([tensor_size],
                                  dtype=item.dtype,
                                  device=item.device)
        partitions = []
        # Carve the flat buffer into per-rank views; copy our own shard in
        # before the all_gather fills the rest.
        for part_rank in range(mp_size):
            part = flat_tensor.narrow(0, partition_size * part_rank,
                                      partition_size)
            if part_rank == mp_rank:
                part.copy_(item)
            partitions.append(part)
        dist.all_gather(partitions, partitions[mp_rank], group=mp_group)
        # Restore the original shape and splice the full data back into the
        # saved tensor object.
        input_tensor = flat_tensor.view(list(size.numpy()))
        item.data = input_tensor.data
        inputs.append(item)
    inputs.append(tensors[-2])
    return tuple(inputs)
class CheckpointFunction(torch.autograd.Function):
    """This function is adapted from torch.utils.checkpoint with
    two main changes:
        1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
        2) the states in the model parallel tracker are also properly
           tracked/set/reset.

    Optionally (PARTITION_ACTIVATIONS) saves only this rank's slice of
    each activation and all-gathers the full tensors again in backward.
    """
    @staticmethod
    def forward(ctx, run_function, *args):
        ctx.run_function = run_function
        # Lazily resolve model-parallel topology on first use.
        global mp_rank, mp_size, mp_group
        if mp_rank is None:
            mp_rank = get_model_parallel_rank()
            mp_size = get_model_parallel_world_size()
            mp_group = get_model_parallel_group()
        global cuda_device, transport_stream, PARTITION_ACTIVATIONS
        if cuda_device is None:
            if dist.get_rank() == 0:
                print(
                    f'Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}'
                )
            cuda_device = torch.cuda.current_device()
            # The transport stream is used to overlap the allgather communication for the activations
            # with the computation in the backward pass
            transport_stream = torch.cuda.Stream(device=cuda_device)
        if PARTITION_ACTIVATIONS:
            # Save only this rank's contiguous slice of each arg except the
            # last (which is assumed non-partitionable, e.g. a mask/flag).
            inputs = [
                item.detach().contiguous().view(-1).narrow(
                    0, get_partition_start(item),
                    get_partition_size(item)).clone() for item in args[:-1]
            ]
            inputs.append(args[-1])
        # just in case something funky is happening such as reuse of inputs
        # NOTE: the forward always runs on the full args; the partitioned
        # copies above are only what gets saved for backward.
        inputs_cuda = [item.to(cuda_device) for item in args]
        # Copy the rng states so backward can replay the forward exactly.
        ctx.fwd_cpu_rng_state = torch.get_rng_state()
        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
        with torch.no_grad():
            outputs = run_function(*inputs_cuda)
        del inputs_cuda
        if PARTITION_ACTIVATIONS:
            # Swap each arg's storage for its partition and remember the
            # original shape so backward can reassemble it.
            new_args = []
            for arg, inp in zip(args, inputs):
                size = torch.tensor(arg.size())
                arg.data = inp.data
                new_args.append(arg)
                new_args.append(size)
            ctx.save_for_backward(*new_args)
        else:
            ctx.save_for_backward(*args)
        return outputs
    @staticmethod
    def backward(ctx, *args):
        if not torch.autograd._is_checkpoint_valid():
            raise RuntimeError('Checkpointing is not compatible with .grad(), '
                               'please use .backward() if possible')
        global cuda_device, transport_stream, PARTITION_ACTIVATIONS
        if PARTITION_ACTIVATIONS:
            # All-gather the full activations on the side stream so it can
            # overlap with other backward work.
            with torch.cuda.stream(transport_stream):
                inputs = get_full_inputs(ctx.saved_tensors)
                detached_inputs = detach_variable(inputs)
        else:
            inputs = ctx.saved_tensors
            detached_inputs = detach_variable(inputs)
        # Store the current states so we can restore them after the replay.
        bwd_cpu_rng_state = torch.get_rng_state()
        bwd_cuda_rng_state = torch.cuda.get_rng_state()
        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
        # Set the states to what it used to be before the forward pass.
        torch.set_rng_state(ctx.fwd_cpu_rng_state)
        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
        if PARTITION_ACTIVATIONS:
            # Make sure the gather on the transport stream has finished
            # before the recomputation consumes the inputs.
            current_stream = torch.cuda.current_stream()
            current_stream.wait_stream(transport_stream)
        # Recompute the forward with grad enabled.
        with torch.enable_grad():
            outputs = ctx.run_function(*detached_inputs)
        # Set the states back to what it was at the start of this function.
        torch.set_rng_state(bwd_cpu_rng_state)
        _set_cuda_rng_state(bwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
        if isinstance(outputs, torch.Tensor):
            outputs = (outputs, )
        torch.autograd.backward(outputs, args)
        # First None matches the run_function argument of forward.
        return (None, ) + tuple(inp.grad for inp in detached_inputs)
def checkpoint(function, *args):
    """Checkpoint a model or part of the model.

    This has been directly copied from torch.utils.checkpoint."""
    return CheckpointFunction.apply(function, *args)
def partition_activations_in_checkpoint(partition_activation):
    """Enable/disable partitioning of checkpointed activations globally.

    Must be called before the first checkpointed forward to take effect
    consistently; prints the new setting on rank 0.
    """
    global PARTITION_ACTIVATIONS
    PARTITION_ACTIVATIONS = partition_activation
    if dist.get_rank() == 0:
        print(
            f'**************Partition Activations {PARTITION_ACTIVATIONS}************'
        )

View File

@@ -0,0 +1,86 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import gpt2.mpu as mpu
import numpy
import torch
class IdentityLayer(torch.nn.Module):
    """Trivial module whose forward pass returns its own learnable weight.

    Used by the parallelism tests to get a differentiable tensor of a
    given shape without any real computation.
    """
    def __init__(self, size, scale=1.0):
        super(IdentityLayer, self).__init__()
        # Learnable weight drawn from a scaled standard normal.
        self.weight = torch.nn.Parameter(scale * torch.randn(size))
    def forward(self):
        # No inputs: the "output" is the parameter itself.
        return self.weight
def set_random_seed(seed):
    """Set random seed for reproducibility.

    Seeds python, numpy, torch CPU and the model-parallel cuda tracker
    (the latter requires model parallelism to be initialized).
    """
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)
def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed from environment variables.

    Reads RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT (with defaults for a
    single-process run) and an optional --local_rank CLI argument from
    the distributed launcher, selects the CUDA device, and calls
    init_process_group with a TCP init method.
    """
    # Get local rank in case it is provided.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--local_rank',
        type=int,
        default=None,
        help='local rank passed from distributed launcher')
    args = parser.parse_args()
    local_rank = args.local_rank
    # Get rank and world size.
    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv('WORLD_SIZE', '1'))
    print('> initializing torch.distributed with local rank: {}, '
          'rank: {}, world size: {}'.format(local_rank, rank, world_size))
    # Set the device id: launcher-provided local rank wins over the
    # round-robin fallback.
    device = rank % torch.cuda.device_count()
    if local_rank is not None:
        device = local_rank
    torch.cuda.set_device(device)
    # Call the init process.
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=init_method)
def print_separator(message):
    """Print a dashed separator with *message* centered, on rank 0 only.

    Barriers before and after so the separator does not interleave with
    other ranks' output.
    """
    torch.distributed.barrier()
    # Center the message in a 78-column dashed rule.
    filler_len = (78 - len(message)) // 2
    filler = '-' * filler_len
    string = '\n' + filler + ' {} '.format(message) + filler
    if torch.distributed.get_rank() == 0:
        print(string, flush=True)
    torch.distributed.barrier()

View File

@@ -0,0 +1,106 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import sys
import gpt2.mpu as mpu
import torch
import torch.nn.functional as F
from commons import (IdentityLayer, initialize_distributed, print_separator,
set_random_seed)
from mpu.cross_entropy import vocab_parallel_cross_entropy
sys.path.append('../..')
def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale,
                        seed):
    """Reference loss/grad from plain (non-parallel) F.cross_entropy.

    Builds [batch, seq, vocab] logits through an IdentityLayer (so the
    gradient can be read off its weight), draws random targets, and
    returns (mean loss, d loss / d logits-weight).
    NOTE(review): requires CUDA; the RNG draw order below must match
    mpu_cross_entropy exactly so both see identical logits/targets.
    """
    set_random_seed(seed)
    identity = IdentityLayer((batch_size, seq_length, vocab_size),
                             scale=logits_scale).cuda()
    logits = identity()
    target = torch.cuda.LongTensor(size=(batch_size,
                                         seq_length)).random_(0, vocab_size)
    loss = F.cross_entropy(
        logits.view(-1,
                    logits.size()[-1]), target.view(-1),
        reduction='none').view_as(target).mean()
    loss.backward()
    return loss, identity.weight.grad
def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed):
    """Model-parallel counterpart of torch_cross_entropy.

    Same construction and RNG draw order, but the logits are scattered
    across the model-parallel region and the loss is computed with
    vocab_parallel_cross_entropy. Returns (mean loss, gradient on the
    full IdentityLayer weight).
    """
    set_random_seed(seed)
    identity = IdentityLayer((batch_size, seq_length, vocab_size),
                             scale=logits_scale).cuda()
    logits = identity()
    # Split the vocab dimension over the model-parallel ranks.
    logits_parallel = mpu.scatter_to_model_parallel_region(logits)
    target = torch.cuda.LongTensor(size=(batch_size,
                                         seq_length)).random_(0, vocab_size)
    loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
    loss.backward()
    return loss, identity.weight.grad
def test_cross_entropy(model_parallel_size):
    """Check that parallel cross entropy matches the serial reference in
    both loss value and gradient (tolerance 1e-6)."""
    if torch.distributed.get_rank() == 0:
        print('> testing cross entropy with model parallel size {} ...'.format(
            model_parallel_size))
    mpu.initialize_model_parallel(model_parallel_size)
    # Re-read the size: it may have been clamped during initialization.
    model_parallel_size = mpu.get_model_parallel_world_size()
    # Vocab is sized per partition so it divides evenly across ranks.
    batch_size = 13
    seq_length = 17
    vocab_size_per_partition = 11
    logits_scale = 1000.0
    vocab_size = vocab_size_per_partition * model_parallel_size
    seed = 1234
    loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
                                                 vocab_size, logits_scale,
                                                 seed)
    loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, vocab_size,
                                           logits_scale, seed)
    # NOTE(review): sub_ mutates the reference tensors in place; they must
    # not be reused after these comparisons.
    error = loss_torch.sub_(loss_mpu).abs().max()
    print(' max error in loss on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    error = grad_torch.sub_(grad_mpu).abs().max()
    print(' max error in grad on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
if __name__ == '__main__':
    # Sweep model-parallel sizes 1, 2, 4, ... up to the world size.
    initialize_distributed()
    world_size = torch.distributed.get_world_size()
    mp_size = 1
    while mp_size <= world_size:
        print_separator('test cross entropy')
        test_cross_entropy(mp_size)
        mp_size *= 2

View File

@@ -0,0 +1,91 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import operator
import sys
import gpt2.mpu as mpu
import torch
from commons import initialize_distributed, print_separator
from mpu import data as data_utils
sys.path.append('../..')
def test_boradcast_data(model_parallel_size):
    """Exercise mpu data broadcast: rank 0 of each model-parallel group
    owns the data and every other rank must receive identical tensors.

    (The name keeps the historical 'boradcast' typo; the __main__ driver
    below calls it by this name.)
    """
    if torch.distributed.get_rank() == 0:
        print(
            '> testing boradcast_data with model parallel size {} ...'.format(
                model_parallel_size))
    mpu.initialize_model_parallel(model_parallel_size)
    # Different data-parallel ranks draw different payloads.
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    model_parallel_size = mpu.get_model_parallel_world_size()
    # Expected shapes of the int64 payload keys.
    key_size_t = {
        'key1': [7, 11],
        'key2': [8, 2, 1],
        'key3': [13],
        'key4': [5, 1, 2],
        'key5': [5, 12]
    }
    keys = list(key_size_t.keys())
    data = {}
    data_t = {}
    for key in key_size_t:
        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
        data_t[key] = data[key].clone()
    # 'keyX' is a float tensor and deliberately NOT in `keys`, so the
    # int64 type check below must ignore it.
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    # Only rank 0 of the model-parallel group supplies data.
    if mpu.get_model_parallel_rank() != 0:
        data = None
    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, \
        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
    for key in keys:
        assert key_size[key] == key_size_t[key]
    total_numel_t = 0
    for key in keys:
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t
    # Broadcast and verify every rank now holds rank 0's tensors.
    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
if __name__ == '__main__':
    # Sweep model-parallel sizes 1, 2, 4, ... up to the world size.
    initialize_distributed()
    world_size = torch.distributed.get_world_size()
    mp_size = 1
    while mp_size <= world_size:
        # NOTE(review): banner keeps its historical double-'test' wording.
        print_separator('test test boradcast data')
        test_boradcast_data(mp_size)
        mp_size *= 2

View File

@@ -0,0 +1,95 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import gpt2.mpu as mpu
import torch
from commons import initialize_distributed, print_separator
sys.path.append('../..')
def test_initialize_model_parallel(model_parallel_size):
    """Check that mpu.initialize_model_parallel builds consistent
    model-parallel and data-parallel groups.

    Args:
        model_parallel_size: requested model-parallel size; it is clamped
            to the world size before use.
    """
    if torch.distributed.get_rank() == 0:
        print('> testing initialize_model_parallel with size {} ...'.format(
            model_parallel_size))
    # Clamp the requested size to what the launcher actually provides.
    model_parallel_size_ = min(model_parallel_size,
                               torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size_)
    assert mpu.model_parallel_is_initialized()

    # Checks.
    def check(group, world_size, rank):
        assert world_size == torch.distributed.get_world_size(group=group)
        assert rank == torch.distributed.get_rank(group=group)

    # Model parallel.
    world_size = model_parallel_size_
    rank = torch.distributed.get_rank() % model_parallel_size_
    assert world_size == mpu.get_model_parallel_world_size()
    assert rank == mpu.get_model_parallel_rank()
    check(mpu.get_model_parallel_group(), world_size, rank)
    # Data parallel.
    world_size = torch.distributed.get_world_size() // model_parallel_size_
    # Bug fix: divide by the clamped size (model_parallel_size_), matching
    # the world_size computation above; the raw argument yields a wrong
    # data-parallel rank whenever it exceeds the world size.
    rank = torch.distributed.get_rank() // model_parallel_size_
    assert world_size == mpu.get_data_parallel_world_size()
    assert rank == mpu.get_data_parallel_rank()
    check(mpu.get_data_parallel_group(), world_size, rank)
    # Reset groups so the next test can re-initialize.
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
def test_get_model_parallel_src_rank(model_parallel_size_):
    """Verify get_model_parallel_src_rank() returns the first global rank
    of this process's model-parallel group."""
    if torch.distributed.get_rank() == 0:
        print('> testing get_model_parallel_src_rank with size {} ...'.format(
            model_parallel_size_))
    model_parallel_size = min(model_parallel_size_,
                              torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size)
    assert mpu.model_parallel_is_initialized()

    # The source rank is this rank with its in-group offset removed.
    expected = torch.distributed.get_rank() - mpu.get_model_parallel_rank()
    assert mpu.get_model_parallel_src_rank() == expected

    # Tear down so the next test can re-initialize.
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
if __name__ == '__main__':
    # Sweep model-parallel sizes 1, 2, 4, ... up to the world size.
    initialize_distributed()
    world_size = torch.distributed.get_world_size()
    mp_size = 1
    while mp_size <= world_size:
        print_separator('test initialize model parallel')
        test_initialize_model_parallel(mp_size)
        print_separator('test model parallel source rank')
        test_get_model_parallel_src_rank(mp_size)
        mp_size *= 2

View File

@@ -0,0 +1,533 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import sys
import gpt2.mpu as mpu
import torch
import torch.nn.init as init
from commons import initialize_distributed, print_separator, set_random_seed
from mpu import layers
from torch.nn.parameter import Parameter
sys.path.append('../..')
def test_parallel_embedding(model_parallel_size):
    """Compare ParallelEmbedding and VocabParallelEmbedding against plain
    torch.nn.Embedding: losses must agree, and each rank's weight grad
    must equal the matching slice of the serial gradient (tol 1e-12)."""
    if torch.distributed.get_rank() == 0:
        print('> testing parallel embedding with model parallel size {} ...'.
              format(model_parallel_size))
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    batch_size = 17
    seq_length = 23
    vocab_size = 48
    hidden_size = 16
    seed = 1236
    # Inputs/loss weights use a seed independent of the layer seed.
    set_random_seed(123)
    input_data = torch.LongTensor(size=(batch_size, seq_length)).random_(
        0, vocab_size).cuda()
    loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()
    # Serial reference; each variant re-seeds so all three layers start
    # from identical master weights.
    set_random_seed(seed)
    embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
    output = embedding_original(input_data)
    loss_original = torch.mul(output, loss_weight).sum()
    loss_original.backward()
    # Hidden-dimension-parallel embedding.
    set_random_seed(seed)
    embedding_parallel = layers.ParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_parallel(input_data)
    loss_parallel = torch.mul(output, loss_weight).sum()
    loss_parallel.backward()
    # Vocab-dimension-parallel embedding.
    set_random_seed(seed)
    embedding_vocab_parallel = layers.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_vocab_parallel(input_data)
    loss_vocab_parallel = torch.mul(output, loss_weight).sum()
    loss_vocab_parallel.backward()
    torch.distributed.barrier()
    error = loss_parallel.sub(loss_original).abs()
    print(' error in loss (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)
    torch.distributed.barrier()
    error = loss_vocab_parallel.sub(loss_original).abs()
    print(' error in loss (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)
    # ParallelEmbedding shards the hidden dim: compare slice along dim 1.
    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   hidden_size // model_parallel_size,
                                   1)[mpu.get_model_parallel_rank()]
    error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
    print(' error in grad (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)
    # VocabParallelEmbedding shards the vocab dim: compare slice along dim 0.
    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   vocab_size // model_parallel_size,
                                   0)[mpu.get_model_parallel_rank()]
    error = embedding_vocab_parallel.weight.grad.sub(
        weight_grad_orig).abs().max()
    print(' error in grad (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
def test_initialize_affine_weight(model_parallel_size):
    """Check _initialize_affine_weight: each rank's shard must equal the
    matching split of a master weight built with the same seed."""
    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing initialize_affine_weight with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()
    seed = 12345
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    # ---------------
    # Column parallel
    # ---------------
    # Shard along dim 0 (output) in output_size_coeff-sized pieces.
    weight = torch.empty(output_size_coeff, input_size)
    set_random_seed(seed)
    layers._initialize_affine_weight(weight, output_size, input_size,
                                     output_size_coeff, 0,
                                     torch.nn.init.normal_)
    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = mpu.get_model_parallel_rank()
    my_weight = torch.split(
        master_weight, output_size_coeff, dim=0)[rank].contiguous().clone()
    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print(' column parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    # ------------
    # Row parallel
    # ------------
    # Shard along dim 1 (input) in input_size_coeff-sized pieces.
    weight = torch.empty(output_size, input_size_coeff)
    set_random_seed(seed)
    mpu.layers._initialize_affine_weight(weight, output_size, input_size,
                                         input_size_coeff, 1,
                                         torch.nn.init.normal_)
    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = mpu.get_model_parallel_rank()
    my_weight = torch.split(
        master_weight, input_size_coeff, dim=1)[rank].contiguous().clone()
    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print(' row parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')
class IdentityLayer2D(torch.nn.Module):
    """Trainable m-by-n weight matrix; calling the module returns it."""

    def __init__(self, m, n):
        super().__init__()
        weight = torch.Tensor(m, n)
        torch.nn.init.xavier_normal_(weight)
        self.weight = Parameter(weight)

    def forward(self):
        # The output is the parameter itself (no inputs).
        return self.weight
def test_column_parallel_linear(model_parallel_size):
    """Check ColumnParallelLinear gradients analytically.

    With Y = f(X) and loss L = sum(Y * W), the closed-form gradients are
    dL/dA = W^T X, dL/db = column sums of W, dL/dX = W A; each rank holds
    a dim-0 (output) shard of dL/dA and dL/db.
    """
    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing ColumnParallelLinear with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()
    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7
    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.ColumnParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()
    # Values: analytic gradients from the un-sharded master weight.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)
    rank = mpu.get_model_parallel_rank()
    # This rank's weight grad is a dim-0 (output) shard of dLdA.
    my_dLdA = torch.split(
        dLdA, output_size_coeff, dim=0)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    my_dLdb = torch.split(
        dLdb, output_size_coeff, dim=0)[rank].contiguous().clone()
    error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')
def test_row_parallel_linear(model_parallel_size):
    """Check RowParallelLinear gradients analytically.

    Same setup as the column test, but the weight is sharded along the
    input dimension (dim 1) and the bias gradient is held in full.
    """
    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing RowParallelLinear with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()
    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7
    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.RowParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()
    # Values: analytic gradients from the un-sharded master weight.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)
    rank = mpu.get_model_parallel_rank()
    # This rank's weight grad is a dim-1 (input) shard of dLdA.
    my_dLdA = torch.split(
        dLdA, input_size_coeff, dim=1)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    error = dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')
class IdentityLayer3D(torch.nn.Module):
    """Trainable m-by-n-by-k tensor; calling the module returns it."""

    def __init__(self, m, n, k):
        super().__init__()
        weight = torch.Tensor(m, n, k)
        torch.nn.init.xavier_normal_(weight)
        self.weight = Parameter(weight)

    def forward(self):
        # The output is the parameter itself (no inputs).
        return self.weight
def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
                            hidden_size_per_att_head, dropout_prob, batch_size,
                            sequence_length):
    """Run one forward/backward through BertParallelSelfAttention.

    Returns (rank, hidden_size, mp_size, loss, attention_layer,
    identity_layer) for cross-configuration comparison.
    NOTE(review): model-parallel state is destroyed before returning, so
    the returned layers can only be inspected, not re-run.
    """
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    seed = 12345
    set_random_seed(seed)
    # Head count scales with the full world size so the total hidden size
    # is identical for every model-parallel configuration.
    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()  # noqa
    hidden_size = hidden_size_per_att_head * num_att_heads
    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
                                                    dropout_prob).cuda()
    loss_weight = torch.randn([batch_size, sequence_length,
                               hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = attention_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()
    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        attention_layer, identity_layer
def test_parallel_self_attention(model_parallel_size):
    """The parallel attention loss and gradients must match the serial
    (model_parallel_size == 1) run within 5e-6."""
    if torch.distributed.get_rank() == 0:
        print('> testing ParallelSelfAttention with model parallel '
              'size: {}'.format(model_parallel_size))
    num_att_heads_per_partition = 3
    hidden_size_per_att_head = 7
    dropout_prob = 0.0  # has to be zero
    batch_size = 5
    sequence_length = 13
    # Serial reference run. (The 'hideen' local-name typo is historical.)
    rank_1, hideen_size_1, model_parallel_size_1, loss_1, \
        attention_layer_1, identity_layer_1 = parallel_self_attention(
            1, num_att_heads_per_partition,
            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)  # noqa
    rank, hidden_size, model_parallel_size, loss, \
        attention_layer, identity_layer = parallel_self_attention(
            model_parallel_size, num_att_heads_per_partition,
            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)  # noqa
    assert hideen_size_1 == hidden_size
    error = loss_1.sub(loss).abs().max()
    torch.distributed.barrier()
    print(' loss error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6
    # The serial QKV weight is interleaved across partitions: gather this
    # rank's rows (stride model_parallel_size) before comparing.
    my_lin_grad_list = torch.split(
        attention_layer_1.query_key_value.weight.grad,
        hidden_size // model_parallel_size, 0)[rank::model_parallel_size]
    my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
    error = my_lin_grad.sub(
        attention_layer.query_key_value.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' weight gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6
    error = identity_layer_1.weight.grad.sub(
        identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' input gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')
def parallel_transformer(model_parallel_size, num_att_heads_per_partition,
                         hidden_size_per_att_head, batch_size,
                         sequence_length):
    """Run one forward/backward through BertParallelTransformerLayer.

    Returns (rank, hidden_size, mp_size, loss, transformer_layer,
    identity_layer) for cross-configuration comparison; model-parallel
    state is destroyed before returning.
    """
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    seed = 12345
    set_random_seed(seed)
    # Sized from the full world size so every configuration builds the
    # same-width network.
    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()  # noqa
    hidden_size = hidden_size_per_att_head * num_att_heads
    intermediate_size = 4 * hidden_size
    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    transformer_layer = mpu.BertParallelTransformerLayer(
        hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
        torch.nn.functional.relu, 1.0e-5).cuda()
    loss_weight = torch.randn([batch_size, sequence_length,
                               hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = transformer_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()
    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        transformer_layer, identity_layer
def test_parallel_transformer_layer(model_parallel_size):
    """The parallel transformer layer's loss and input gradient must match
    the serial (model_parallel_size == 1) run within 5e-5."""
    if torch.distributed.get_rank() == 0:
        print('> testing ParallelTransformerLayer with model parallel '
              'size: {}'.format(model_parallel_size))
    num_att_heads_per_partition = 3
    hidden_size_per_att_head = 7
    batch_size = 5
    sequence_length = 13
    # Serial reference vs parallel run (both fully seeded).
    rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
        transformer_layer_1, identity_layer_1 = parallel_transformer(
            1, num_att_heads_per_partition,
            hidden_size_per_att_head, batch_size, sequence_length)
    rank, hidden_size, model_parallel_size, loss, \
        transformer_layer, identity_layer = parallel_transformer(
            model_parallel_size, num_att_heads_per_partition,
            hidden_size_per_att_head, batch_size, sequence_length)
    error = loss_1.sub(loss).abs().max()
    torch.distributed.barrier()
    print(' loss error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-5, 'error: {}'.format(error)
    error = identity_layer_1.weight.grad.sub(
        identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' input gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-5, 'error: {}'.format(error)
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')
if __name__ == '__main__':
    # Full determinism so serial and parallel runs can be compared.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    # Each section sweeps model-parallel sizes 1, 2, 4, ... <= world size.
    print_separator('test initialize affine weight')
    mp_size = 1
    while mp_size <= world_size:
        test_initialize_affine_weight(mp_size)
        mp_size *= 2

    mp_size = 1
    while mp_size <= world_size:
        print_separator('test parallel embedding')
        test_parallel_embedding(mp_size)
        mp_size *= 2

    print_separator('test column-parallel linear')
    mp_size = 1
    while mp_size <= world_size:
        test_column_parallel_linear(mp_size)
        mp_size *= 2

    print_separator('test row-parallel linear')
    mp_size = 1
    while mp_size <= world_size:
        test_row_parallel_linear(mp_size)
        mp_size *= 2

    print_separator('test parallel self-attention')
    mp_size = 1
    while mp_size <= world_size:
        test_parallel_self_attention(mp_size)
        mp_size *= 2

    print_separator('test parallel transformer')
    mp_size = 1
    while mp_size <= world_size:
        test_parallel_transformer_layer(mp_size)
        mp_size *= 2

View File

@@ -0,0 +1,206 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import gpt2.mpu as mpu
import torch
from commons import initialize_distributed, print_separator
sys.path.append('../..')
def test_set_cuda_rng_state(model_parallel_size):
    """Check mpu.random._set_cuda_rng_state: restoring a saved CUDA RNG
    state must replay the same random sequence, and the save/restore must
    not mutate the saved state object."""
    if torch.distributed.get_rank() == 0:
        print('> testing set_rng_state with size {} ...'.format(
            model_parallel_size))
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    size = 123
    seed = 1234  # noqa
    torch.cuda.manual_seed(1234)
    tensor = torch.cuda.FloatTensor(size)
    # Get the state
    rng_state = torch.cuda.get_rng_state()
    rng_state_copy = rng_state.clone()
    # Do some stuff.
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_1 = tensor.clone()
    # Drawing numbers advances the live state but must leave the saved
    # snapshot untouched.
    assert rng_state.sub(rng_state_copy).max() == 0
    assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0
    # State should be different.
    new_rng_state = torch.cuda.get_rng_state()
    max_diff = new_rng_state.sub(rng_state).max()
    print(
        ' max diff in rng state (should be non-zero) on global rank {}: {}'.
        format(torch.distributed.get_rank(), max_diff))
    assert max_diff > 0
    # Reset the rng state and do the same stuff.
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_2 = tensor.clone()
    # Results should be the same
    error = result_2.sub(result_1).abs().max()
    print(' max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    # Input state should have remained intact.
    error = rng_state.sub(rng_state_copy).max()
    print(' max error in rng state (should be zero) on global rank {}: {}'.
          format(torch.distributed.get_rank(), error))
    assert error == 0
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
def test_cuda_rng_tracker(model_parallel_size):
    """Check the CUDA RNG tracker: forking to a named stream must yield
    the same numbers as a dedicated generator seeded with that stream's
    seed, while the default stream continues unperturbed."""
    if torch.distributed.get_rank() == 0:
        print('> testing cuda rng tracker with size {} ...'.format(
            model_parallel_size))
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    seed_1 = 1234
    seed_2 = 4321
    size = [12, 21]
    tensor = torch.cuda.FloatTensor(size)
    # Set to seed_1 and generate two tensors.
    torch.cuda.manual_seed(seed_1)
    torch.randn(size, out=tensor)
    target_11 = tensor.clone()
    torch.randn(size, out=tensor)
    target_12 = tensor.clone()
    # Set to seed_2 and generate two tensors.
    torch.cuda.manual_seed(seed_2)
    torch.randn(size, out=tensor)
    target_21 = tensor.clone()
    torch.randn(size, out=tensor)
    target_22 = tensor.clone()
    # Now if we interleave seed_1 and seed_2,
    # we should still get the same tensors
    torch.cuda.manual_seed(seed_1)
    mpu.get_cuda_rng_tracker().add('test', seed_2)
    torch.randn(size, out=tensor)
    result_11 = tensor.clone()
    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_21 = tensor.clone()
    torch.randn(size, out=tensor)
    result_12 = tensor.clone()
    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_22 = tensor.clone()
    # The two streams must differ from each other ...
    diff = result_11.sub(result_21).abs().max()
    diff = min(diff, result_12.sub(result_22).abs().max())
    print(' max diff in generated tensors (should be non-zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
    assert diff > 1.0e-6
    # ... and each must match its corresponding single-seed target.
    error = max(
        result_11.sub(target_11).abs().max(),
        result_12.sub(target_12).abs().max())
    error = max(error, result_21.sub(target_21).abs().max())
    error = max(error, result_22.sub(target_22).abs().max())
    print(' max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6
    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
def test_model_parallel_cuda_manual_seed(model_parallel_size):
    """Check the seeding convention: the default CUDA seed equals the
    given seed, and the model-parallel fork uses seed + 2718 + rank."""
    if torch.distributed.get_rank() == 0:
        print('> testing model parallel cuda manual seed with size {} ...'.
              format(model_parallel_size))
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    mpu.model_parallel_cuda_manual_seed(12345)
    assert torch.cuda.initial_seed() == 12345
    with mpu.get_cuda_rng_tracker().fork():
        # 2718 is the fixed model-parallel seed offset used by mpu.
        assert torch.cuda.initial_seed() == (12345 + 2718
                                             + mpu.get_model_parallel_rank())
    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()
    # Reset groups
    mpu.destroy_model_parallel()
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')
if __name__ == '__main__':
    # Sweep model-parallel sizes 1, 2, 4, ... up to the world size for
    # each RNG test in turn.
    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    mp_size = 1
    while mp_size <= world_size:
        print_separator('test set rng state')
        test_set_cuda_rng_state(mp_size)
        mp_size *= 2

    mp_size = 1
    while mp_size <= world_size:
        print_separator('test cuda rng tracker')
        test_cuda_rng_tracker(mp_size)
        mp_size *= 2

    mp_size = 1
    while mp_size <= world_size:
        print_separator('test model parallel cuda manual seed')
        test_model_parallel_cuda_manual_seed(mp_size)
        mp_size *= 2

View File

@@ -0,0 +1,886 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer."""
import math
import deepspeed
import torch
import torch.nn.init as init
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
from .initialize import get_model_parallel_world_size
from .layers import ColumnParallelLinear, RowParallelLinear
from .mappings import gather_from_model_parallel_region
from .random import checkpoint, get_cuda_rng_tracker
from .utils import divide, split_tensor_along_last_dim
class PositionalEmbedding(torch.nn.Module):
    """Sinusoidal positional embedding (Transformer-XL style).

    Maps a 1D position sequence to a [1 (or bsz), len(pos_seq), hidden_size]
    tensor whose first half along the last dim is sin and second half cos of
    the position/frequency outer product.
    """

    def __init__(self, hidden_size):
        super(PositionalEmbedding, self).__init__()
        self.hidden_size = hidden_size
        # Inverse frequencies 1 / 10000^(2i/d); a buffer so it follows the
        # module's device/dtype moves but is never trained.
        inv_freq = 1 / (
            10000**(torch.arange(0.0, hidden_size, 2.0) / hidden_size))  # noqa
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, pos_seq, bsz=None):
        # Outer product: [len(pos_seq), hidden_size // 2]
        angles = torch.ger(pos_seq, self.inv_freq)
        embeddings = torch.cat([angles.sin(), angles.cos()], dim=-1)
        embeddings = embeddings[None, :, :]
        if bsz is None:
            return embeddings
        return embeddings.expand(bsz, -1, -1)
class GPT2ParallelSelfAttention(torch.nn.Module):
    """Parallel self-attention layer for GPT2.

    Takes input with size [b, s, h] (b: batch, s: sequence length, h: hidden
    size) and creates output of the same size. Optionally supports
    Transformer-XL style relative position scoring and a cache of previous
    hidden states (`mem`).

    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). n must be
                             divisible by the number of GPUs used to
                             parallelize the model, and h divisible by n.
        attention_dropout_prob: dropout probability for the attention scores.
        output_dropout_prob: dropout probability for the layer output.
        init_method: weight initialization.
        output_layer_init_method: output layer initialization. If None, use
                                  `init_method`.
        relative_encoding: if True, forward expects position_embeddings,
                           r_w_bias and r_r_bias (Transformer-XL scoring).

    Notation: h: hidden_size, n: num_attention_heads, p: number of
    partitions, np: n/p, hp: h/p, hn: h/n, b: batch size, s: seq length.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 init_method,
                 output_layer_init_method=None,
                 relative_encoding=False):
        super(GPT2ParallelSelfAttention, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Per attention head and per partition values.
        world_size = get_model_parallel_world_size()
        self.hidden_size_per_partition = divide(hidden_size, world_size)
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)
        self.num_attention_heads_per_partition = divide(
            num_attention_heads, world_size)
        self.relative_encoding = relative_encoding
        # Strided linear layer producing fused [query, key, value].
        self.query_key_value = ColumnParallelLinear(
            hidden_size,
            3 * hidden_size,
            stride=3,
            gather_output=False,
            init_method=init_method)
        if relative_encoding:
            # Projection of the relative position embeddings.
            # NOTE(review): output dim is hidden_size_per_partition (already
            # divided by world_size) and ColumnParallelLinear splits its
            # output again across partitions -- confirm this is intended
            # for world_size > 1.
            self.relative = ColumnParallelLinear(
                hidden_size,
                self.hidden_size_per_partition,
                gather_output=False,
                init_method=init_method)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)
        # Output.
        self.dense = RowParallelLinear(
            hidden_size,
            hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)
        self.output_dropout = torch.nn.Dropout(output_dropout_prob)
        # When deepspeed activation checkpointing is configured, rebind the
        # module-level RNG tracker and checkpoint function to deepspeed's
        # implementations (affects every instance in this module).
        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """
        new_tensor_shape = tensor.size()[:-1] + \
            (self.num_attention_heads_per_partition,  # noqa
             self.hidden_size_per_attention_head)  # noqa
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)

    @staticmethod
    def _rel_shift(x, zero_triu=False):
        """Transformer-XL relative shift: realigns the relative-position
        score matrix by zero-padding the last dim and reshaping."""
        # ql x kl x bsz x h
        # bsz x h x ql x kl
        zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1),
                               device=x.device,
                               dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)
        x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2))
        x = x_padded[:, :, 1:].view_as(x)
        if zero_triu:
            # Optionally zero out the upper triangle of the shifted scores.
            ones = torch.ones((x.size(0), x.size(1)))
            x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None]
        return x

    @staticmethod
    def _rel_shift_latest(x: torch.Tensor):
        """Alternative relative-shift implementation operating on dims 2/3
        via explicit reshape; not called in this class's forward."""
        ndims = x.dim()
        x_shape = x.size()
        row_dim = 2
        col_dim = row_dim + 1
        assert col_dim < ndims
        tgt_shape_1, tgt_shape_2 = [], []
        for i in range(ndims):
            if i == row_dim:
                tgt_shape_1.append(x_shape[col_dim])
                tgt_shape_2.append(x_shape[row_dim])
            elif i == col_dim:
                tgt_shape_1.append(x_shape[row_dim])
                tgt_shape_2.append(x_shape[col_dim] - 1)
            else:
                tgt_shape_1.append(x_shape[i])
                tgt_shape_2.append(x_shape[i])
        x = x.view(*tgt_shape_1)
        x = x[:, :, 1:, :]
        x = x.view(*tgt_shape_2)
        return x

    def forward(self,
                hidden_states,
                ltor_mask,
                position_embeddings=None,
                r_w_bias=None,
                r_r_bias=None,
                mem=None):
        """Compute masked (left-to-right) self-attention.

        hidden_states: [b, s, h]; ltor_mask: [1, 1, s, s] multiplicative
        mask (1 = attend, 0 = blocked). When `mem` is given, keys/values
        are computed over [mem; hidden_states] while queries cover only the
        current segment. Returns [b, s, h].
        """
        # Attention heads. [b, s, hp]
        query_length = hidden_states.size(1)
        if mem is None:
            mixed_x_layer = self.query_key_value(hidden_states)
            (mixed_query_layer, mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(
                 mixed_x_layer, 3)
        else:
            # Prepend the memory so keys/values span the full context.
            cat = torch.cat((mem, hidden_states), 1)
            mixed_x_layer = self.query_key_value(cat)
            (mixed_query_layer, mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(
                 mixed_x_layer, 3)
            # Queries only for the current segment (last query_length tokens).
            mixed_query_layer = mixed_query_layer[:, -query_length:]
        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        if self.relative_encoding:
            relative_layer = self.relative(position_embeddings)
            relative_layer = self._transpose_for_scores(
                relative_layer)  # 1 (bsz) x n_head x klen x d_head
            # Raw attention scores. [b, np, qs, ks]
            # Content-based score (term "AC" in Transformer-XL).
            rw_head_q = query_layer + r_w_bias.unsqueeze(1)
            ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2))
            # Position-based score (term "BD"), realigned with _rel_shift.
            rr_head_q = query_layer + r_r_bias.unsqueeze(1)
            bd_score = torch.matmul(rr_head_q,
                                    relative_layer.transpose(-1, -2))
            bd_score = self._rel_shift(bd_score)  # qlen x klen x bsz x n_head
            # bd_score = bd_score.permute(2, 3, 0, 1) # bsz n_head qlen klen
            attention_scores = ac_score + bd_score
        else:
            # Raw attention scores. [b, np, s, s]
            attention_scores = torch.matmul(query_layer,
                                            key_layer.transpose(-1, -2))
        # Scale by sqrt(head dim).
        attention_scores = attention_scores / math.sqrt(
            self.hidden_size_per_attention_head)
        # Apply the left to right attention mask: blocked positions get a
        # large negative score so softmax drives them to ~0.
        attention_scores = torch.mul(attention_scores, ltor_mask) - \
            10000.0 * (1.0 - ltor_mask)  # noqa
        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        # The fork keeps dropout RNG consistent across model-parallel ranks.
        with get_cuda_rng_tracker().fork():
            attention_probs = self.attention_dropout(attention_probs)
        # Context layer.
        # [b, np, s, hn]
        context_layer = torch.matmul(attention_probs, value_layer)
        # [b, s, np, hn]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + \
            (self.hidden_size_per_partition,)  # noqa
        # [b, s, hp]
        context_layer = context_layer.view(*new_context_layer_shape)
        # Output. [b, s, h]
        output = self.dense(context_layer)
        output = self.output_dropout(output)
        return output
@torch.jit.script
def gelu_impl(x):
    """Tanh-approximation GELU (OpenAI/GPT-2 variant)."""
    # 0.7978845608... = sqrt(2 / pi); same arithmetic order as before.
    inner = 0.7978845608028654 * x * (1.0 + 0.044715 * x * x)
    return 0.5 * x * (1.0 + torch.tanh(inner))
def gelu(x):
    """Apply the jit-scripted tanh-approximation GELU to `x`."""
    return gelu_impl(x)
class GPT2ParallelMLP(torch.nn.Module):
    """GPT-2 feed-forward block: h -> 4h -> gelu -> h, then dropout.

    Arguments:
        hidden_size: hidden size h.
        output_dropout_prob: dropout probability on the block output.
        init_method: weight initializer (biases start at zero, layernorm
                     weights at one).
        output_layer_init_method: initializer for the 4h -> h projection;
                                  defaults to `init_method` when None.
    """

    def __init__(self,
                 hidden_size,
                 output_dropout_prob,
                 init_method,
                 output_layer_init_method=None):
        super(GPT2ParallelMLP, self).__init__()
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Column-parallel expansion to 4h (output stays partitioned).
        self.dense_h_to_4h = ColumnParallelLinear(
            hidden_size,
            4 * hidden_size,
            gather_output=False,
            init_method=init_method)
        # Row-parallel projection back to h (consumes partitioned input).
        self.dense_4h_to_h = RowParallelLinear(
            4 * hidden_size,
            hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)
        self.dropout = torch.nn.Dropout(output_dropout_prob)

    def forward(self, hidden_states):
        # [b, s, h] -> [b, s, 4hp] -> gelu -> [b, s, h] -> dropout
        expanded = gelu(self.dense_h_to_4h(hidden_states))
        return self.dropout(self.dense_4h_to_h(expanded))
class GPT2ParallelTransformerLayer(torch.nn.Module):
    """A single GPT-2 transformer layer (pre-LayerNorm).

    Maps [b, s, h] -> [b, s, h] via:
        LN -> self-attention -> residual -> LN -> MLP -> residual

    Arguments:
        hidden_size: hidden size h of the self attention.
        num_attention_heads: number of attention heads.
        attention_dropout_prob: dropout on the attention scores.
        output_dropout_prob: dropout after attention and MLP outputs.
        layernorm_epsilon: epsilon used in LayerNorm to avoid division
                           by zero.
        init_method: weight initializer for the linear layers.
        output_layer_init_method: initializer for the output projections
                                  (attention output and MLP output);
                                  falls back to `init_method` when None.
        relative_encoding: enable Transformer-XL relative attention.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None,
                 relative_encoding=False):
        super(GPT2ParallelTransformerLayer, self).__init__()
        # Output projections default to the same init as the rest.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Submodules are created in this exact order so the parameter
        # registration (state_dict layout) stays checkpoint-compatible.
        self.input_layernorm = torch.nn.LayerNorm(
            hidden_size, eps=layernorm_epsilon)
        self.attention = GPT2ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method,
            relative_encoding=relative_encoding)
        self.post_attention_layernorm = torch.nn.LayerNorm(
            hidden_size, eps=layernorm_epsilon)
        self.mlp = GPT2ParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)

    def forward(self,
                hidden_states,
                ltor_mask,
                position_embeddings=None,
                r_w_bias=None,
                r_r_bias=None,
                mem=None):
        """Run one pre-LN block. hidden_states: [b, s, h]; ltor_mask:
        [1, 1, s, s]; `mem` is an optional Transformer-XL memory which is
        normalized with the same input layernorm before attention."""
        normed = self.input_layernorm(hidden_states)
        normed_mem = self.input_layernorm(mem) if mem is not None else None
        attn_out = self.attention(normed, ltor_mask, position_embeddings,
                                  r_w_bias, r_r_bias, normed_mem)
        # First residual connection.
        attn_residual = hidden_states + attn_out
        # Second sub-block: LN -> MLP -> residual.
        mlp_out = self.mlp(self.post_attention_layernorm(attn_residual))
        return attn_residual + mlp_out
def unscaled_init_method(sigma):
    """Build a weight initializer that fills tensors from N(0, sigma)."""

    def initializer(tensor):
        # In-place normal fill; returns the tensor, like torch.nn.init.
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return initializer
def scaled_init_method(sigma, num_layers):
    """Build an initializer from N(0, sigma / sqrt(2 * num_layers)).

    The 1/sqrt(2L) scaling (GPT-2 style) keeps residual-branch output
    magnitudes from growing with depth.
    """
    scaled_std = sigma / math.sqrt(2.0 * num_layers)

    def initializer(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=scaled_std)

    return initializer
class GPT2ParallelTransformer(torch.nn.Module):
    """GPT-2 transformer stack.

    Takes embedded input and returns hidden states usable directly by a
    logit layer. Consists of num_layers blocks of [layer norm, self
    attention, residual, layer norm, mlp, residual] followed by a final
    layer norm. Optionally maintains Transformer-XL style memories of
    previous segments (max_memory_length > 0) and supports activation
    checkpointing.

    Arguments:
        num_layers: Number of transformer layers.
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention heads.
        max_sequence_length: size of the (absolute) position embedding
                             table; unused when relative_encoding is True.
        max_memory_length: number of past tokens kept as memory per layer
                           (0 disables memories).
        embedding_dropout_prob: dropout on the input embeddings.
        attention_dropout_prob: dropout on the attention scores.
        output_dropout_prob: dropout after attention and final output.
        checkpoint_activations: if True, checkpoint activations.
        checkpoint_num_layers: chunk size (in layers) for checkpointing.
        layernorm_epsilon: epsilon used in layernorm.
        init_method_std: std of the N(0, std) weight init.
        use_scaled_init_for_output_weights: if True use 1/sqrt(2*num_layers)
                                            scaling for the output weights.
        relative_encoding: use Transformer-XL relative position encoding.
    """

    def __init__(self,
                 num_layers,
                 hidden_size,
                 num_attention_heads,
                 max_sequence_length,
                 max_memory_length,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 layernorm_epsilon=1.0e-5,
                 init_method_std=0.02,
                 use_scaled_init_for_output_weights=True,
                 relative_encoding=False):
        super(GPT2ParallelTransformer, self).__init__()
        # Store activation checkpoiting flag.
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers
        self.max_memory_length = max_memory_length
        output_layer_init_method = None
        if use_scaled_init_for_output_weights:
            output_layer_init_method = scaled_init_method(
                init_method_std, num_layers)
        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
        self.relative_encoding = relative_encoding
        if relative_encoding:
            # Relative position embedding
            self.position_embeddings = PositionalEmbedding(hidden_size)
            # Per attention head and per partition values.
            world_size = get_model_parallel_world_size()
            self.hidden_size_per_attention_head = divide(
                hidden_size, num_attention_heads)
            self.num_attention_heads_per_partition = divide(
                num_attention_heads, world_size)
            # Learned global content/position biases (Transformer-XL's
            # u and v), one row per local attention head.
            self.r_w_bias = torch.nn.Parameter(
                torch.Tensor(self.num_attention_heads_per_partition,
                             self.hidden_size_per_attention_head))
            self.r_w_bias.model_parallel = True
            self.r_r_bias = torch.nn.Parameter(
                torch.Tensor(self.num_attention_heads_per_partition,
                             self.hidden_size_per_attention_head))
            self.r_r_bias.model_parallel = True
            # Always initialize bias to zero.
            with torch.no_grad():
                self.r_w_bias.zero_()
                self.r_r_bias.zero_()
        else:
            # Position embedding (serial).
            self.position_embeddings = torch.nn.Embedding(
                max_sequence_length, hidden_size)
            # Initialize the position embeddings.
            torch.nn.init.normal_(
                self.position_embeddings.weight, mean=0.0, std=init_method_std)

        def get_layer():
            # Build one transformer layer with the shared settings.
            return GPT2ParallelTransformerLayer(
                hidden_size,
                num_attention_heads,
                attention_dropout_prob,
                output_dropout_prob,
                layernorm_epsilon,
                unscaled_init_method(init_method_std),
                output_layer_init_method=output_layer_init_method,
                relative_encoding=relative_encoding)

        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [get_layer() for _ in range(num_layers)])
        # Final layer norm before output.
        self.final_layernorm = torch.nn.LayerNorm(
            hidden_size, eps=layernorm_epsilon)
        # When deepspeed activation checkpointing is configured, rebind the
        # module-level RNG tracker and checkpoint function to deepspeed's.
        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def forward(self, hidden_states, position_ids, attention_mask, *mems):
        """Run the stack.

        hidden_states: [b, s, h] embedded input; position_ids: absolute
        positions (ignored when relative_encoding); attention_mask:
        multiplicative mask sliced to the last (s + memory) key positions;
        *mems: optional per-layer memories. Returns (output, *new_mems).
        """
        batch_size, query_length = hidden_states.size()[:2]
        memory_length = mems[0].size(1) if mems else 0
        key_length = query_length + memory_length
        attention_mask = attention_mask[:, :, :,
                                        -query_length - memory_length:]
        if self.relative_encoding:
            # NOTE(review): embedding dropout is applied to hidden_states
            # both here and again a few lines below -- i.e. twice per
            # forward in the relative-encoding path (a no-op in eval mode).
            # Looks unintentional; confirm before changing.
            hidden_states = self.embedding_dropout(hidden_states)
            # Relative positions run from key_length-1 down to 0.
            position_sequence = torch.arange(
                key_length - 1,
                -1,
                -1.0,
                device=hidden_states.device,
                dtype=hidden_states.dtype)
            position_embeddings = self.position_embeddings(position_sequence)
            # Apply dropout
            position_embeddings = self.embedding_dropout(position_embeddings)
            hidden_states = self.embedding_dropout(hidden_states)
        else:
            position_embeddings = self.position_embeddings(position_ids)
            hidden_states = hidden_states + position_embeddings
            hidden_states = self.embedding_dropout(hidden_states)
        if self.max_memory_length > 0:
            # First entry is the (detached) embedding output; per-layer
            # outputs are appended as the stack runs.
            mem_layers = [hidden_states.detach()]
        else:
            mem_layers = []

        def custom(start, end):
            # Closure over self/mem_layers used as the checkpointed segment
            # covering layers [start, end).
            def custom_forward(*inputs):
                layers_ = self.layers[start:end]
                x_, inputs = inputs[0], inputs[1:]
                if self.relative_encoding:
                    # inputs = (mask, pos_emb, r_w_bias, r_r_bias), then mems.
                    inputs, mems_ = inputs[:4], inputs[4:]
                else:
                    inputs, mems_ = inputs[:1], inputs[1:]
                for i, layer in enumerate(layers_):
                    mem_i_ = mems_[i] if mems_ else None
                    x_ = layer(x_, *inputs, mem=mem_i_)
                    if self.max_memory_length > 0:
                        mem_layers.append(x_.detach())
                return x_

            return custom_forward

        if self.checkpoint_activations:
            # Run the layers in checkpointed chunks of checkpoint_num_layers.
            l = 0  # noqa
            num_layers = len(self.layers)
            chunk_length = self.checkpoint_num_layers
            while l < num_layers:
                args = [hidden_states, attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                if mems:
                    args += mems[l:l + chunk_length]
                hidden_states = checkpoint(custom(l, l + chunk_length),
                                           *args)  # noqa
                l += chunk_length  # noqa
        else:
            for i, layer in enumerate(self.layers):
                args = [hidden_states, attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                mem_i = mems[i] if mems else None
                hidden_states = layer(*args, mem=mem_i)
                if self.max_memory_length > 0:
                    mem_layers.append(hidden_states.detach())
        # Final layer norm.
        output = self.final_layernorm(hidden_states)
        if self.max_memory_length > 0:
            mem_layers = self.update_mems(mem_layers, mems)
        return (output, *mem_layers)

    def update_mems(self, hiddens, mems):
        """Concatenate old memories with the new hidden states and keep
        only the trailing max_memory_length tokens per layer."""
        memory_length = mems[0].size(1) if mems else 0
        query_length = hiddens[0].size(1)
        new_memory_length = min(self.max_memory_length,
                                memory_length + query_length)
        new_mems = []
        # Memories are constants w.r.t. backprop, hence no_grad.
        with torch.no_grad():
            for i in range(len(hiddens)):
                if new_memory_length <= query_length:
                    new_mems.append(hiddens[i][:, -new_memory_length:])
                else:
                    new_mems.append(
                        torch.cat(
                            (mems[i][:, -new_memory_length + query_length:],
                             hiddens[i]),
                            dim=1))
        return new_mems
class BertParallelSelfAttention(torch.nn.Module):
    """Parallel self-attention layer for BERT.

    Takes input with size [b, s, h] (b: batch, s: sequence length, h:
    hidden size) and creates output of the same size, using an additive
    attention mask.

    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). n must be
                             divisible by the number of GPUs used to
                             parallelize the model, and h divisible by n.
        dropout_prob: dropout probability for the attention scores.
        output_parallel: If true, no all-gather is done on the output and
                         the output values will be per partition.
        init_method: weight initialization.

    Notation: h: hidden_size, n: num_attention_heads, p: number of
    partitions, np: n/p, hp: h/p, hn: h/n, b: batch size, s: seq length.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 dropout_prob,
                 output_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelSelfAttention, self).__init__()
        # Input configuration.
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.dropout_prob = dropout_prob
        self.output_parallel = output_parallel
        # Per attention head and per partition values.
        world_size = get_model_parallel_world_size()
        self.hidden_size_per_partition = divide(hidden_size, world_size)
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)
        self.num_attention_heads_per_partition = divide(
            num_attention_heads, world_size)
        # Strided linear layer producing fused [query, key, value].
        self.query_key_value = ColumnParallelLinear(
            hidden_size,
            3 * hidden_size,
            stride=3,
            gather_output=False,
            init_method=init_method)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.dropout = torch.nn.Dropout(dropout_prob)
        # When deepspeed activation checkpointing is configured, rebind the
        # module-level RNG tracker and checkpoint function to deepspeed's.
        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """  # noqa
        new_tensor_shape = tensor.size()[:-1] + (
            self.num_attention_heads_per_partition,
            self.hidden_size_per_attention_head)
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        """Compute self-attention; attention_mask is ADDED to the raw
        scores (use large negative values to block positions)."""
        # Attention heads. [b, s, hp]
        mixed_x_layer = self.query_key_value(hidden_states)
        (mixed_query_layer, mixed_key_layer,
         mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        # Raw attention scores. [b, np, s, s]
        # Scale q and k each by sqrt(sqrt(hn)) so the product carries the
        # full 1/sqrt(hn) scaling while keeping intermediate values small.
        norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head))
        attention_scores = torch.matmul(
            query_layer / norm_factor,
            key_layer.transpose(-1, -2) / norm_factor)
        # Apply the attention mask.
        attention_scores += attention_mask
        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        # The fork keeps dropout RNG consistent across model-parallel ranks.
        with get_cuda_rng_tracker().fork():
            attention_probs = self.dropout(attention_probs)
        # Context layer.
        # [b, np, s, hn]
        context_layer = torch.matmul(attention_probs, value_layer)
        # [b, s, np, hn]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.hidden_size_per_partition, )  # noqa
        # [b, s, hp]
        context_layer = context_layer.view(*new_context_layer_shape)
        # Output. [b, s, h]
        if self.output_parallel:
            output = context_layer
        else:
            output = gather_from_model_parallel_region(context_layer)
        return output
class BertParallelTransformerOutput(torch.nn.Module):
    """Output sub-layer used after the self-attention and intermediate
    parts of a BERT transformer layer: dense -> dropout -> residual -> LN."""

    def __init__(self,
                 input_size,
                 output_size,
                 dropout_prob,
                 layernorm_epsilon=1.0e-12,
                 input_is_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerOutput, self).__init__()
        # Row-parallel projection (optionally consuming partitioned input).
        self.dense = RowParallelLinear(
            input_size,
            output_size,
            input_is_parallel=input_is_parallel,
            init_method=init_method)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.layernorm = torch.nn.LayerNorm(output_size, eps=layernorm_epsilon)

    def forward(self, hidden_states, input_tensor):
        # Project, drop, add the residual, then normalize.
        projected = self.dropout(self.dense(hidden_states))
        return self.layernorm(projected + input_tensor)
class BertParallelTransformerLayer(torch.nn.Module):
    """A single BERT (post-LayerNorm) transformer layer.

    Maps [b, s, h] -> [b, s, h] via:
        self-attention -> add & LN -> intermediate -> activation
        -> output projection -> add & LN

    Arguments:
        hidden_size: hidden size h of the self attention.
        intermediate_size: size of the intermediate state (4h in both
                           BERT and GPT).
        num_attention_heads: number of attention heads.
        attention_dropout_prob: dropout on the attention scores.
        output_dropout_prob: dropout after attention and final output.
        intermediate_activation_fn: activation for the intermediate output.
        layernorm_epsilon: epsilon used in LayerNorm.
        init_method: weight initializer (biases start at zero, layernorm
                     weights at one).
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 intermediate_activation_fn,
                 layernorm_epsilon,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerLayer, self).__init__()
        # Submodule creation order is fixed: it determines state_dict layout.
        self.attention = BertParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_parallel=True,
            init_method=init_method)
        self.self_output = BertParallelTransformerOutput(
            hidden_size,
            hidden_size,
            output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method)
        self.intermediate = ColumnParallelLinear(
            hidden_size,
            intermediate_size,
            gather_output=False,
            init_method=init_method)
        self.intermediate_activation_fn = intermediate_activation_fn
        self.output = BertParallelTransformerOutput(
            intermediate_size,
            hidden_size,
            output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method)

    def forward(self, hidden_states, attention_mask):
        """Apply the layer; attention_mask is added to the raw scores."""
        # [b, s, hp] attention kept partition-parallel (output_parallel=True).
        attn_parallel = self.attention(hidden_states, attention_mask)
        # [b, s, h] residual + layernorm around the attention output.
        attn_out = self.self_output(attn_parallel, hidden_states)
        # [b, s, ip] expand and activate.
        inter_parallel = self.intermediate_activation_fn(
            self.intermediate(attn_out))
        # [b, s, h] project back with residual + layernorm.
        return self.output(inter_parallel, attn_out)

View File

@@ -0,0 +1,70 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
def ensure_divisibility(numerator, denominator):
    """Raise AssertionError unless numerator is a multiple of denominator."""
    remainder = numerator % denominator
    assert remainder == 0, '{} is not divisible by {}'.format(
        numerator, denominator)
def divide(numerator, denominator):
    """Return numerator // denominator, asserting exact divisibility."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)
    return numerator // denominator
def split_tensor_along_last_dim(tensor,
                                num_partitions,
                                contiguous_split_chunks=False):
    """Split a tensor into equal chunks along its last dimension.

    Arguments:
        tensor: input tensor whose last dimension must be divisible by
                num_partitions.
        num_partitions: number of chunks to split the tensor into.
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory (torch.split returns views).
    """
    last_dim = tensor.dim() - 1
    last_dim_size = tensor.size()[last_dim]
    assert last_dim_size % num_partitions == 0, \
        '{} is not divisible by {}'.format(last_dim_size, num_partitions)
    chunk_size = last_dim_size // num_partitions
    chunks = torch.split(tensor, chunk_size, dim=last_dim)
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in chunks)
    return chunks
class VocabUtility:
    """Split a vocabulary into `world_size` equal chunks and compute the
    half-open index range [first, last) owned by a given `rank`."""

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        """Range owned by `rank` given the per-partition vocab size."""
        start = rank * per_partition_vocab_size
        return start, start + per_partition_vocab_size

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank,
                                           world_size):
        """Range owned by `rank` given the global vocab size (which must
        divide evenly across `world_size` partitions)."""
        assert global_vocab_size % world_size == 0, \
            '{} is not divisible by {}'.format(global_vocab_size, world_size)
        per_partition_vocab_size = global_vocab_size // world_size
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)

View File

@@ -0,0 +1,440 @@
# Modified by Zhipu.AI
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for logging and serialization"""
import os
import random
import time
import numpy as np
import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from . import model, mpu
from .fp16 import FP16_Optimizer
SUMMARY_WRITER_DIR_NAME = 'runs'
def print_rank_0(message):
    """Print only on global rank 0 (or always when not distributed)."""
    if (not torch.distributed.is_initialized()
            or torch.distributed.get_rank() == 0):
        print(message, flush=True)
def print_args(args):
    """Print every attribute of an argparse-style namespace, dot-aligned."""
    print('arguments:', flush=True)
    for name in vars(args):
        padding = '.' * (29 - len(name))
        print('  {} {} {}'.format(name, padding, getattr(args, name)),
              flush=True)
def print_params_min_max_norm(optimizer, iteration):
    """Print min, max, and L2 norm of every parameter in the optimizer."""
    index = 0
    rank = torch.distributed.get_rank()
    string = 'iteration, rank, index, model-parallel,min, max, norm\n'
    # Unwrap the FP16 wrapper to reach the underlying optimizer.
    inner_optimizer = optimizer
    if isinstance(optimizer, FP16_Optimizer):
        inner_optimizer = optimizer.optimizer
    for group in inner_optimizer.param_groups:
        for param in group['params']:
            index += 1
            data = param.data
            string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format(
                iteration, rank, index, int(param.model_parallel))
            string += '{:.6E}, {:.6E}, {:.6E}\n'.format(
                data.min(), data.max(), data.norm())
    print(string, flush=True)
class Timers:
    """Group of named cumulative timers."""

    class Timer:
        """A single cumulative wall-clock timer."""

        def __init__(self, name):
            self.name_ = name
            self.elapsed_ = 0.0
            self.started_ = False
            self.start_time = time.time()

        def start(self):
            """Start the timer."""
            assert not self.started_, 'timer has already been started'
            # Synchronize so outstanding GPU work is attributed correctly.
            torch.cuda.synchronize()
            self.start_time = time.time()
            self.started_ = True

        def stop(self):
            """Stop the timer and accumulate the elapsed interval."""
            assert self.started_, 'timer is not started'
            torch.cuda.synchronize()
            self.elapsed_ += (time.time() - self.start_time)
            self.started_ = False

        def reset(self):
            """Zero the accumulated time and mark the timer stopped."""
            self.elapsed_ = 0.0
            self.started_ = False

        def elapsed(self, reset=True):
            """Return accumulated seconds; optionally reset the counter.

            If the timer is running it is stopped for the read and then
            restarted, so timing in progress is preserved."""
            was_running = self.started_
            if was_running:
                self.stop()
            total = self.elapsed_
            if reset:
                self.reset()
            if was_running:
                self.start()
            return total

    def __init__(self):
        self.timers = {}

    def __call__(self, name):
        # Create the named timer lazily on first use.
        try:
            return self.timers[name]
        except KeyError:
            timer = self.Timer(name)
            self.timers[name] = timer
            return timer

    def log(self, names, normalizer=1.0, reset=True):
        """Log a group of timers (milliseconds, divided by normalizer)."""
        assert normalizer > 0.0
        string = 'time (ms)'
        for name in names:
            ms = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer
            string += ' | {}: {:.2f}'.format(name, ms)
        print_rank_0(string)
def report_memory(name):
    """Print a simple GPU memory report for the current device.

    Args:
        name (str): label prefixed to the report line.

    Fixes vs. the previous version: 'max cached' used
    ``torch.cuda.memory_reserved()`` (the *current* reserved amount) instead
    of the maximum; also replaces the deprecated ``memory_cached()`` alias
    with ``memory_reserved()``.
    """
    mega_bytes = 1024.0 * 1024.0
    string = name + ' memory (MB)'
    string += ' | allocated: {}'.format(torch.cuda.memory_allocated()
                                        / mega_bytes)
    string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated()
                                            / mega_bytes)
    string += ' | cached: {}'.format(torch.cuda.memory_reserved() / mega_bytes)
    string += ' | max cached: {}'.format(torch.cuda.max_memory_reserved()
                                         / mega_bytes)
    print_rank_0(string)
def get_checkpoint_name(checkpoints_path,
                        iteration,
                        release=False,
                        zero=False):
    """Build the path of a checkpoint file.

    Layout: ``<root>/<iter or release>[_zero_dp_rank_N]/mp_rank_XX/model_optim_rng.pt``.

    Args:
        checkpoints_path: root checkpoint directory.
        iteration: training iteration (ignored when ``release`` is True).
        release: use the 'release' directory instead of an iteration one.
        zero: append the data-parallel rank for ZeRO partition files.
    """
    directory = 'release' if release else 'iter_{:07d}'.format(iteration)
    if zero:
        directory += '_zero_dp_rank_{}'.format(mpu.get_data_parallel_rank())
    return os.path.join(
        checkpoints_path, directory,
        'mp_rank_{:02d}'.format(mpu.get_model_parallel_rank()),
        'model_optim_rng.pt')
def ensure_directory_exists(filename):
    """Create the parent directory of ``filename`` if it does not exist.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of a check-then-create
    pair, which raced when several ranks created the same directory
    concurrently. A bare filename (empty dirname) is a no-op instead of an
    error.

    Args:
        filename (str): path whose parent directory must exist.
    """
    dirname = os.path.dirname(filename)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
def get_checkpoint_tracker_filename(checkpoints_path):
    """Return the path of the tracker file that records the latest
    checkpointed iteration under ``checkpoints_path``."""
    tracker = 'latest_checkpointed_iteration.txt'
    return os.path.join(checkpoints_path, tracker)
def save_zero_checkpoint(args, iteration, optimizer):
    """Persist the ZeRO optimizer partition for this data-parallel rank.

    Args:
        args: training arguments; ``args.save`` is the checkpoint root.
        iteration: current training iteration.
        optimizer: optimizer whose partitioned state is saved.
    """
    zero_checkpoint_name = get_checkpoint_name(args.save, iteration, zero=True)
    ensure_directory_exists(zero_checkpoint_name)
    state = {
        'iteration': iteration,
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(state, zero_checkpoint_name)
    print(' successfully saved {}'.format(zero_checkpoint_name))
def save_checkpoint(iteration, model, optimizer, lr_scheduler, args):  # noqa
    """Save a model checkpoint, then update the latest-iteration tracker.

    With DeepSpeed enabled this delegates to ``save_ds_checkpoint``;
    otherwise only data-parallel rank 0 of each model-parallel group writes
    the checkpoint file. All ranks synchronize on barriers afterwards.
    """
    if args.deepspeed:
        save_ds_checkpoint(iteration, model, lr_scheduler, args)
    else:
        # Only rank zero of the data parallel group writes to the disk.
        if isinstance(model, torchDDP):
            model = model.module
        if mpu.get_data_parallel_rank() == 0:
            checkpoint_name = get_checkpoint_name(args.save, iteration)
            print(
                'global rank {} is saving checkpoint at iteration {:7d} to {}'.
                format(torch.distributed.get_rank(), iteration,
                       checkpoint_name))
            sd = {'iteration': iteration, 'model': model.state_dict()}
            # Optimizer stuff.
            if not args.no_save_optim:
                if optimizer is not None:
                    sd['optimizer'] = optimizer.state_dict()
                if lr_scheduler is not None:
                    sd['lr_scheduler'] = lr_scheduler.state_dict()
            # RNG states, so training can resume bit-exactly.
            if not args.no_save_rng:
                sd['random_rng_state'] = random.getstate()
                sd['np_rng_state'] = np.random.get_state()
                sd['torch_rng_state'] = torch.get_rng_state()
                sd['cuda_rng_state'] = torch.cuda.get_rng_state()
                sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker(
                ).get_states()
            ensure_directory_exists(checkpoint_name)
            torch.save(sd, checkpoint_name)
            print(' successfully saved {}'.format(checkpoint_name))
    # Wait so everyone is done (necessary).
    torch.distributed.barrier()
    # Update the tracker file from global rank 0 only.
    if torch.distributed.get_rank() == 0:
        tracker_filename = get_checkpoint_tracker_filename(args.save)
        with open(tracker_filename, 'w') as f:
            f.write(str(iteration))
    # Wait so everyone is done (not necessary).
    torch.distributed.barrier()
def save_ds_checkpoint(iteration, model, lr_scheduler, args):  # noqa
    """Save a DeepSpeed checkpoint, attaching client-side state.

    DeepSpeed persists model/optimizer state itself; the lr scheduler and
    RNG states are passed through ``client_state`` so they can be restored
    on load.
    """
    client_state = {'iteration': iteration}
    if lr_scheduler is not None:
        client_state['client_lr_scheduler'] = lr_scheduler.state_dict()
    # RNG states, so training can resume bit-exactly.
    if not args.no_save_rng:
        client_state['random_rng_state'] = random.getstate()
        client_state['np_rng_state'] = np.random.get_state()
        client_state['torch_rng_state'] = torch.get_rng_state()
        client_state['cuda_rng_state'] = torch.cuda.get_rng_state()
        client_state['rng_tracker_states'] = mpu.get_cuda_rng_tracker(
        ).get_states()
    model.save_checkpoint(args.save, str(iteration), client_state=client_state)
def get_checkpoint_iteration(args):
    """Read the tracker file under ``args.load``.

    Returns:
        (iteration, release, success): the recorded iteration (0 for a
        'release' checkpoint), whether it is a release checkpoint, and
        whether a tracker file was found at all.
    """
    tracker_filename = get_checkpoint_tracker_filename(args.load)
    if not os.path.isfile(tracker_filename):
        print_rank_0('WARNING: could not find the metadata file {} '.format(
            tracker_filename))
        print_rank_0(' will not load any checkpoints and will start from '
                     'random')
        return 0, False, False
    with open(tracker_filename, 'r') as f:
        metastring = f.read().strip()
    iteration = 0
    release = False
    try:
        iteration = int(metastring)
    except ValueError:
        # Non-numeric content is only valid when it is the literal 'release'.
        release = metastring == 'release'
        if not release:
            print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
                tracker_filename))
            exit()
    assert iteration > 0 or release, 'error parsing metadata file {}'.format(
        tracker_filename)
    return iteration, release, True
def load_checkpoint(
        model,  # noqa
        optimizer,
        lr_scheduler,
        args,
        load_optimizer_states=True):
    """Load a model checkpoint and restore optimizer/scheduler/RNG state.

    Returns:
        The iteration recorded in the checkpoint; 0 when no checkpoint was
        found, when finetuning, or when a 'release' checkpoint was loaded.
    """
    iteration, release, success = get_checkpoint_iteration(args)
    if not success:
        # No tracker file found: start from scratch.
        return 0
    if args.deepspeed:
        # DeepSpeed restores model/optimizer state itself; only the
        # client-side lr scheduler state needs explicit handling.
        checkpoint_name, sd = model.load_checkpoint(
            args.load, iteration, load_optimizer_states=not args.no_load_optim)
        if 'client_lr_scheduler' in sd:
            lr_scheduler.load_state_dict(sd['client_lr_scheduler'])
            print_rank_0('Load lr scheduler state')
        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print('Unable to load checkpoint.')
            return iteration
    else:
        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)
        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))
        # Load the checkpoint on CPU to avoid spiking GPU memory.
        sd = torch.load(checkpoint_name, map_location='cpu')
        if isinstance(model, torchDDP):
            model = model.module
        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            print_rank_0('A metadata file exists but unable to load model '
                         'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()
        # Optimizer: skipped for release checkpoints, finetuning, or
        # when explicitly disabled.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None and load_optimizer_states:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))
                exit()
    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0(
                    'A metadata file exists but Unable to load iteration '
                    ' from checkpoint {}, exiting'.format(checkpoint_name))
                exit()
    # RNG states, restored so a resumed run continues bit-exactly.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0(
                'Unable to load optimizer from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the random '
                'state.'.format(checkpoint_name))
            exit()
    if mpu.get_data_parallel_rank() == 0:
        print(' successfully loaded {}'.format(checkpoint_name))
    return iteration
def load_weights(src, dst, dst2src=False):
    """
    Loads weights from src to dst via in place copy.
    src is a huggingface gpt2model, while dst is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src is still untested
    """
    conv_layer = 'Conv1D' in str(type(src))
    for name, param in src.named_parameters():
        if dst2src:
            source_data = dst._parameters[name].data
            target = param.data
        else:
            source_data = param.data
            target = dst._parameters[name].data
        # HF Conv1D weights are transposed relative to our layout,
        # hence the extra transpose for weight tensors.
        if conv_layer and 'weight' in name:
            source_data = source_data.t().contiguous()
        target.copy_(source_data)
def load_mlp(our, oai, dst2src=False):
    """Copy MLP weights between our block and the HF GPT-2 block."""
    load_weights(oai.c_fc, our.dense_h_to_4h, dst2src)
    load_weights(oai.c_proj, our.dense_4h_to_h, dst2src)


def load_attention(our, oai, dst2src=False):
    """Copy attention weights between our block and the HF GPT-2 block."""
    load_weights(oai.c_attn, our.query_key_value, dst2src)
    load_weights(oai.c_proj, our.dense, dst2src)


def load_transformer_layer(our, oai, dst2src=False):
    """Copy one full transformer layer: both layernorms, MLP, attention."""
    load_weights(oai.ln_1, our.input_layernorm, dst2src)
    load_weights(oai.ln_2, our.post_attention_layernorm, dst2src)
    load_mlp(our.mlp, oai.mlp, dst2src)
    load_attention(our.attention, oai.attn, dst2src)
def move_weights(our, oai, dst2src=False):
    """
    Loads weights from `oai` to `our` via in place copy.
    `oai` is a huggingface gpt2model, while `our` is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src=True is still untested
    """
    hf_transformer = oai.transformer
    # Embeddings and final layernorm first, then every transformer layer.
    load_weights(hf_transformer.ln_f, our.transformer.final_layernorm,
                 dst2src)
    load_weights(hf_transformer.wte, our.word_embeddings, dst2src)
    load_weights(hf_transformer.wpe, our.position_embeddings, dst2src)
    for ours, theirs in zip(our.transformer.layers, hf_transformer.h):
        load_transformer_layer(ours, theirs, dst2src)

View File

@@ -0,0 +1,33 @@
# Copyright (c) 2022 Zhipu.AI
import os
from typing import Dict
from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
from .fastpoem import fast_poem, prepare_model
@MODELS.register_module(Tasks.fast_poem, module_name=Models.txl)
class TXLForFastPoem(TorchModel):
    """Transformer-XL based fast-poem generation model wrapper."""

    def __init__(self, model_dir: str, *args, **kwargs):
        """Initialize the fast poem model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)
        # initialize model, tokenizer and generation args from the checkpoint
        self.model, self.tokenizer, self.args = prepare_model(model_dir)

    def forward(self, input: Dict[str, str]) -> Dict[str, str]:
        # Intentionally unimplemented: generation goes through `generate`.
        pass

    def generate(self, input: Dict[str, str]) -> Dict[str, str]:
        """Generate a poem from the input fields and return it under
        ``OutputKeys.TEXT``."""
        res = fast_poem(input, self.model, self.tokenizer, self.args)
        return {OutputKeys.TEXT: res['text']}

View File

@@ -32,6 +32,7 @@ if TYPE_CHECKING:
from .word_segmentation_pipeline import WordSegmentationPipeline
from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline
from .txl_fast_poem_pipeline import TXLFastPoemPipeline
from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \
WordSegmentationThaiPipeline
@@ -73,6 +74,7 @@ else:
'zero_shot_classification_pipeline':
['ZeroShotClassificationPipeline'],
'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'],
'txl_fast_poem_pipeline': ['TXLFastPoemPipeline'],
'multilingual_word_segmentation_pipeline': [
'MultilingualWordSegmentationPipeline',
'WordSegmentationThaiPipeline'

View File

@@ -0,0 +1,59 @@
# Copyright (c) 2022 Zhipu.AI
from typing import Any, Dict, Optional, Union
from modelscope.metainfo import Pipelines
from modelscope.models.base import Model
from modelscope.models.nlp import TXLForFastPoem
from modelscope.pipelines.base import Pipeline, Tensor
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import Preprocessor, TXLFastPoemPreprocessor
from modelscope.utils.constant import Tasks
__all__ = ['TXLFastPoemPipeline']
@PIPELINES.register_module(
    group_key=Tasks.fast_poem, module_name=Pipelines.txl_fast_poem)
class TXLFastPoemPipeline(Pipeline):
    """Pipeline that generates classical Chinese poems with TXLForFastPoem."""

    # Defaults used for any generation field the caller omits.
    # lycr / senlength semantics come from the model — presumably characters
    # per line and number of lines; TODO confirm against fastpoem docs.
    _DEFAULTS = {
        'title': '月光',
        'author': '李白',
        'desc': '寂寞',
        'lycr': 7,
        'senlength': 4
    }

    def __init__(self,
                 model: Union[TXLForFastPoem, str],
                 preprocessor: Optional[Preprocessor] = None,
                 *args,
                 **kwargs):
        """Build the pipeline.

        Args:
            model: a TXLForFastPoem instance or a model id/path to load it from.
            preprocessor: optional preprocessor; a pass-through default is
                created when omitted.

        Fix vs. the previous version: the ``preprocessor`` annotation was the
        invalid ``[Preprocessor]`` (a list literal, not a type) — now
        ``Optional[Preprocessor]``.
        """
        model = TXLForFastPoem(model) if isinstance(model, str) else model
        self.model = model
        self.model.eval()
        if preprocessor is None:
            preprocessor = TXLFastPoemPreprocessor()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)

    def forward(self, inputs: Union[Dict, str],
                **forward_params) -> Dict[str, Any]:
        """Fill in missing generation fields and delegate to the model.

        A bare string is treated as the poem title. The caller's dict is not
        mutated; defaults are merged into a copy.
        """
        if isinstance(inputs, str):
            inputs = {'title': inputs}
        merged = dict(self._DEFAULTS)
        merged.update(inputs)
        return self.model.generate(merged)

    def postprocess(self, input, **kwargs) -> Dict[str, Any]:
        """The model output is already final; return it unchanged."""
        return input

View File

@@ -23,7 +23,7 @@ if TYPE_CHECKING:
SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor,
TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor,
TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize,
WordSegmentationBlankSetToLabelPreprocessor,
WordSegmentationBlankSetToLabelPreprocessor, TXLFastPoemPreprocessor,
MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor,
TextGenerationJiebaPreprocessor, SentencePiecePreprocessor,
DialogIntentPredictionPreprocessor, DialogModelingPreprocessor,
@@ -57,7 +57,7 @@ else:
'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor',
'Tokenize', 'Text2TextGenerationPreprocessor',
'WordSegmentationBlankSetToLabelPreprocessor',
'MGLMSummarizationPreprocessor',
'MGLMSummarizationPreprocessor', 'TXLFastPoemPreprocessor',
'ZeroShotClassificationPreprocessor',
'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor',
'NERPreprocessorViet', 'NERPreprocessorThai',

View File

@@ -30,6 +30,7 @@ if TYPE_CHECKING:
from .space_T_en import ConversationalTextToSqlPreprocessor
from .space_T_cn import TableQuestionAnsweringPreprocessor
from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor
from .txl_fast_poem_preprocessor import TXLFastPoemPreprocessor
else:
_import_structure = {
'nlp_base': [
@@ -64,6 +65,7 @@ else:
'TextErrorCorrectionPreprocessor',
],
'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'],
'txl_fast_poem_preprocessor': ['TXLFastPoemPreprocessor'],
'token_classification_thai_preprocessor': [
'NERPreprocessorThai',
'WordSegmentationPreprocessorThai',

View File

@@ -0,0 +1,26 @@
# Copyright (c) 2022 Zhipu.AI
import re
from typing import Any, Dict, Iterable, Optional, Tuple, Union
from modelscope.metainfo import Models, Preprocessors
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import PREPROCESSORS
from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
from modelscope.utils.type_assert import type_assert
@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.txl_fast_poem)
class TXLFastPoemPreprocessor(Preprocessor):
    """Pass-through preprocessor for the TXL fast-poem pipeline.

    Input parsing happens downstream in the model/pipeline, so this class
    only exists to satisfy the pipeline's preprocessor interface.
    """

    def __init__(self, *args, **kwargs):
        """Create the preprocessor; it takes no configuration of its own."""
        super().__init__(*args, **kwargs)

    @type_assert(object, (str, tuple, Dict))
    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
        # Inputs are forwarded unchanged after the type assertion.
        return data

View File

@@ -118,6 +118,7 @@ class NLPTasks(object):
table_question_answering = 'table-question-answering'
fill_mask = 'fill-mask'
text_summarization = 'text-summarization'
fast_poem = 'fast-poem'
question_answering = 'question-answering'
zero_shot_classification = 'zero-shot-classification'
backbone = 'backbone'

View File

@@ -0,0 +1,40 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest
from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.preprocessors import TXLFastPoemPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.test_utils import test_level
class TXLTest(unittest.TestCase, DemoCompatibilityCheck):
    """Integration test for the TransformerXL fast-poem pipeline."""

    def setUp(self) -> None:
        # Directory for any artifacts a test run may produce.
        self.output_dir = 'unittest_output'
        os.makedirs(self.output_dir, exist_ok=True)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_TXL_with_name(self):
        """Build the fast-poem pipeline from a model-hub id and run it."""
        model = 'ZhipuAI/TransformerXL-Fast-Poem'
        preprocessor = TXLFastPoemPreprocessor()
        pipe = pipeline(
            task=Tasks.fast_poem,
            model=model,
            preprocessor=preprocessor,
        )
        # lycr / senlength: generation controls forwarded to the model —
        # presumably characters per line and line count; TODO confirm.
        inputs = {
            'title': '明月',
            'author': '杜甫',
            'desc': '寂寞',
            'lycr': 7,
            'senlength': 4
        }
        result = pipe(inputs)
        print(result)


if __name__ == '__main__':
    unittest.main()