mirror of https://github.com/modelscope/modelscope.git
init
@@ -84,6 +84,7 @@ class Models(object):
    T5 = 'T5'
    mglm = 'mglm'
    bloom = 'bloom'
    txl = 'txl'

    # audio models
    sambert_hifigan = 'sambert-hifigan'
@@ -253,6 +254,7 @@ class Pipelines(object):
    document_segmentation = 'document-segmentation'
    feature_extraction = 'feature-extraction'
    mglm_text_summarization = 'mglm-text-summarization'
    txl_fast_poem = 'txl-fast-poem'
    translation_en_to_de = 'translation_en_to_de'  # keep it underscore
    translation_en_to_ro = 'translation_en_to_ro'  # keep it underscore
    translation_en_to_fr = 'translation_en_to_fr'  # keep it underscore
@@ -379,6 +381,7 @@ class Preprocessors(object):
    document_segmentation = 'document-segmentation'
    feature_extraction = 'feature-extraction'
    mglm_summarization = 'mglm-summarization'
    txl_fast_poem = 'txl-fast-poem'
    sentence_piece = 'sentence-piece'

    # audio preprocessor

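The three registry entries above are what let the hub dispatch a fast-poem request to the new TXL model. A minimal usage sketch, assuming the standard ModelScope pipeline entry point; the task name and model id below are illustrative placeholders, not confirmed by this diff:

# Hypothetical usage of the new 'txl-fast-poem' pipeline.
# '<txl-fast-poem model id>' is a placeholder for the published hub id.
from modelscope.pipelines import pipeline

poem_pipeline = pipeline(task='text-generation', model='<txl-fast-poem model id>')
print(poem_pipeline('望庐山瀑布'))
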
@@ -36,6 +36,7 @@ if TYPE_CHECKING:
    )
    from .T5 import T5ForConditionalGeneration
    from .mglm import MGLMForTextSummarization
    from .txl_poem import TXLForFastPoem
    from .task_models import (
        FeatureExtractionModel,
        InformationExtractionModel,
@@ -108,6 +109,7 @@ else:
        'sentence_embedding': ['SentenceEmbedding'],
        'T5': ['T5ForConditionalGeneration'],
        'mglm': ['MGLMForTextSummarization'],
        'txl_poem': ['TXLForFastPoem'],
        'gpt_neo': ['GPTNeoModel'],
        'bloom': ['BloomModel'],
    }

22 modelscope/models/nlp/txl_poem/__init__.py (Executable file)
@@ -0,0 +1,22 @@
# Modified by Zhipu.AI
# Original Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .txl_for_fast_poem import TXLForFastPoem
else:
    _import_structure = {
        'txl_for_fast_poem': ['TXLForFastPoem'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
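At import time the module replaces itself with a LazyImportModule, so importing the package stays cheap and the heavy txl_for_fast_poem module is only loaded on first attribute access. A minimal sketch of the observable behavior:

import modelscope.models.nlp.txl_poem as txl_poem  # cheap: submodule not loaded yet
model_cls = txl_poem.TXLForFastPoem               # first access triggers the real import
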
946 modelscope/models/nlp/txl_poem/arguments.py (Normal file)
@@ -0,0 +1,946 @@
# Modified by Zhipu.AI
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""argparser configuration"""

import argparse
import os
import subprocess

import deepspeed
import json
import torch


def get_hostname():
    hostname_cmd = ['hostname -I']
    result = subprocess.check_output(hostname_cmd, shell=True)
    master_addr = result.decode('utf-8').split()[0]
    return master_addr

def add_model_config_args(parser):
    """Model arguments"""

    group = parser.add_argument_group('model', 'model configuration')

    group.add_argument('--transformer-xl', action='store_true',
                       help='use transformer-xl for training')
    group.add_argument('--pretrained-bert', action='store_true',
                       help='use a pretrained bert-large-uncased model instead '
                       'of initializing from scratch. See '
                       '--tokenizer-model-type to specify which pretrained '
                       'BERT model to use')
    group.add_argument('--encoder-decoder', action='store_true',
                       help='use the encoder-decoder architecture for blocklm')
    group.add_argument('--attention-dropout', type=float, default=0.1,
                       help='dropout probability for attention weights')
    group.add_argument('--num-attention-heads', type=int, default=16,  # yuandong64
                       help='num of transformer attention heads')
    group.add_argument('--hidden-size', type=int, default=1024,  # yuandong4096
                       help='transformer hidden size')
    group.add_argument('--intermediate-size', type=int, default=None,
                       help='transformer embedding dimension for FFN; '
                       'set to 4*`--hidden-size` if it is None')
    group.add_argument('--num-layers', type=int, default=24,  # yuandong48
                       help='num decoder layers')
    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                       help='layer norm epsilon')
    group.add_argument('--hidden-dropout', type=float, default=0.1,
                       help='dropout probability for hidden state transformer')
    group.add_argument('--output-dropout', type=float, default=0.1,
                       help='dropout probability for pooled output')
    group.add_argument('--max-position-embeddings', type=int, default=512,  # yuandong1024
                       help='maximum number of position embeddings to use')
    group.add_argument('--max-sequence-length', type=int, default=512,
                       help='maximum number of position embeddings to use')
    group.add_argument('--vocab-size', type=int, default=30522,
                       help='vocab size to use for non-character-level '
                       'tokenization. This value will only be used when '
                       'creating a tokenizer')
    group.add_argument('--deep-init', action='store_true',
                       help='initialize bert model similar to gpt2 model. '
                       'Scales initialization of projection layers by a '
                       'factor of 1/sqrt(2N). Necessary to train bert '
                       'models larger than BERT-Large.')
    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                       help='Pad the vocab size to be divisible by this value. '
                       'This is added for computational efficiency reasons.')
    group.add_argument('--cpu-optimizer', action='store_true', help='Run optimizer on CPU')
    group.add_argument('--cpu_torch_adam', action='store_true',
                       help='Use Torch Adam as optimizer on CPU.')
    group.add_argument('--sandwich-ln', action='store_true',
                       help='add sandwich ln in cogview.')
    return parser

def add_fp16_config_args(parser):
    """Mixed precision arguments."""

    group = parser.add_argument_group('fp16', 'fp16 configurations')

    group.add_argument('--fp16', action='store_true', help='Run model in fp16 mode')
    group.add_argument('--fp32-embedding', action='store_true', help='embedding in fp32')
    group.add_argument('--fp32-layernorm', action='store_true', help='layer norm in fp32')
    group.add_argument('--fp32-tokentypes', action='store_true',
                       help='embedding token types in fp32')
    group.add_argument('--fp32-allreduce', action='store_true', help='all-reduce in fp32')
    group.add_argument('--hysteresis', type=int, default=2,
                       help='hysteresis for dynamic loss scaling')
    group.add_argument('--loss-scale', type=float, default=None,
                       help='Static loss scaling, positive power of 2 '
                       'values can improve fp16 convergence. If None, dynamic '
                       'loss scaling is used.')
    group.add_argument('--loss-scale-window', type=float, default=1000,
                       help='Window over which to raise/lower dynamic scale')
    group.add_argument('--min-scale', type=float, default=1,
                       help='Minimum loss scale for dynamic loss scale')
    group.add_argument('--attention-scale', type=float, default=1.0)
    return parser

def add_training_args(parser):
    """Training arguments."""

    group = parser.add_argument_group('train', 'training configurations')

    group.add_argument('--experiment-name', type=str, default='gpt-345M',
                       help='The experiment name for summary and checkpoint')
    group.add_argument('--batch-size', type=int, default=4, help='Data Loader batch size')
    group.add_argument('--gradient-accumulation-steps', type=int, default=1,
                       help='number of gradient accumulation steps')
    group.add_argument('--weight-decay', type=float, default=0.01,
                       help='weight decay coefficient for L2 regularization')
    group.add_argument('--checkpoint-activations', action='store_true',
                       help='checkpoint activation to allow for training '
                       'with larger models and sequences')
    group.add_argument('--checkpoint-num-layers', type=int, default=1,
                       help='chunk size (number of layers) for checkpointing')
    group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
                       help='uses activation checkpointing from deepspeed')
    group.add_argument('--epochs', type=int, default=None,
                       help='Number of finetuning epochs. Zero results in evaluation only.')
    group.add_argument('--clip-grad', type=float, default=1.0, help='gradient clipping')
    group.add_argument('--train-iters', type=int, default=0,  # 1000000->0
                       help='total number of iterations to train over all training runs')
    group.add_argument('--label-smoothing', type=float, default=0.0)
    group.add_argument('--log-interval', type=int, default=100, help='report interval')
    group.add_argument('--exit-interval', type=int, default=None,
                       help='Exit the program after this many new iterations.')
    group.add_argument('--summary-dir', type=str, default='',
                       help='The directory to store the summary')
    group.add_argument('--seed', type=int, default=1234, help='random seed')
    # Batch producer arguments
    group.add_argument('--reset-position-ids', action='store_true',
                       help='Reset position ids after end-of-document token.')
    group.add_argument('--reset-attention-mask', action='store_true',
                       help='Reset self attention mask after end-of-document token.')

    # Learning rate.
    group.add_argument('--lr-decay-iters', type=int, default=None,
                       help='number of iterations to decay LR over; '
                       'if None defaults to `--train-iters`*`--epochs`')
    group.add_argument('--lr-decay-style', type=str, default='linear',
                       choices=['constant', 'linear', 'cosine', 'exponential'],
                       help='learning rate decay function')
    group.add_argument('--lr-decay-ratio', type=float, default=0.5)
    group.add_argument('--lr', type=float, default=1.0e-4, help='initial learning rate')
    group.add_argument('--warmup', type=float, default=0.01,
                       help='percentage of data to warmup on (.01 = 1% of all '
                       'training iters). Default 0.01')
    group.add_argument('--switch-linear', action='store_true',
                       help='Switch to linear decay for cosine decay')
    # model checkpointing
    group.add_argument('--save', type=str, default=None,
                       help='Output directory to save checkpoints to.')
    group.add_argument('--new-save-directory', action='store_true')
    group.add_argument('--save-epoch', type=int, default=1,
                       help='number of epochs between saves')
    group.add_argument('--save-interval', type=int, default=5000,
                       help='number of iterations between saves')
    group.add_argument('--no-save-optim', action='store_true',
                       help='Do not save current optimizer.')
    group.add_argument('--no-save-rng', action='store_true',
                       help='Do not save current rng state.')
    group.add_argument('--load', type=str, default=None,
                       help='Path to a directory containing a model checkpoint.')
    group.add_argument('--no-load-optim', action='store_true',
                       help='Do not load optimizer when loading checkpoint.')
    group.add_argument('--no-load-rng', action='store_true',
                       help='Do not load rng state when loading checkpoint.')
    group.add_argument('--no-load-lr-scheduler', action='store_true',
                       help='Do not load lr scheduler when loading checkpoint.')
    group.add_argument('--no-deepspeed-load', action='store_true',
                       help='Do not use deepspeed when loading a checkpoint')
    group.add_argument('--finetune', action='store_true',
                       help='Load model for finetuning. Do not load optimizer '
                       'or rng state from checkpoint and set iteration to 0. '
                       'Assumed when loading a release checkpoint.')
    group.add_argument('--mode', type=str, default='pretrain',
                       choices=['pretrain', 'finetune', 'inference'],
                       help='what type of task to use; will influence '
                       'auto-warmup, exp name and iteration')
    group.add_argument('--resume-dataloader', action='store_true',
                       help='Resume the dataloader when resuming training. '
                       'Does not apply to tfrecords dataloader; try resuming '
                       'with a different seed in this case.')
    # distributed training args
    group.add_argument('--distributed-backend', default='nccl',
                       help='which backend to use for distributed '
                       'training. One of [gloo, nccl]')
    group.add_argument('--DDP-impl', default='torch',
                       choices=['local', 'torch', 'none'],
                       help='which DistributedDataParallel implementation to use.')
    group.add_argument('--local_rank', type=int, default=None,
                       help='local rank passed from distributed launcher')

    return parser

def add_evaluation_args(parser):
    """Evaluation arguments."""

    group = parser.add_argument_group('validation', 'validation configurations')

    group.add_argument('--eval-batch-size', type=int, default=None,
                       help='Data Loader batch size for evaluation datasets. '
                       'Defaults to `--batch-size`')
    group.add_argument('--eval-iters', type=int, default=100,
                       help='number of iterations to run evaluation on '
                       'validation/test for')
    group.add_argument('--eval-interval', type=int, default=1000,
                       help='interval between running evaluation on validation set')
    group.add_argument('--eval-epoch', type=int, default=1,
                       help='epochs between running evaluation on validation set')
    group.add_argument('--eval-seq-length', type=int, default=None,
                       help='Maximum sequence length to process for '
                       'evaluation. Defaults to `--seq-length`')
    group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
                       help='Maximum number of predictions to use for '
                       'evaluation. Defaults to '
                       'math.ceil(`--eval-seq-length`*.15/10)*10')
    group.add_argument('--overlapping-eval', type=int, default=32,
                       help='sliding window for overlapping eval')
    # group.add_argument('--cloze-eval', action='store_true',
    #                    help='Evaluation dataset from `--valid-data` is a cloze task')
    group.add_argument('--eval-hf', action='store_true',
                       help='perform evaluation with huggingface openai model. '
                       'Use `--load` to specify weights path to be loaded')
    group.add_argument('--load-openai', action='store_true',
                       help='load openai weights into our model. Use `--load` '
                       'to specify weights path to be loaded')

    return parser

def add_text_generate_args(parser):
    """Text generation arguments."""

    group = parser.add_argument_group('Text generation', 'configurations')
    group.add_argument('--temperature', type=float, default=1.0)
    group.add_argument('--top_p', type=float, default=0.0)
    group.add_argument('--top_k', type=int, default=0)
    group.add_argument('--num-beams', type=int, default=1)
    group.add_argument('--out-seq-length', type=int, default=256)  # yuandong512
    group.add_argument('--length-penalty', type=float, default=0.0)
    group.add_argument('--no-repeat-ngram-size', type=int, default=0)
    group.add_argument('--min-tgt-length', type=int, default=0)
    group.add_argument('--select-topk', action='store_true')
    group.add_argument('--blank-maskratio', type=float, default=0.1)
    group.add_argument('--input-source', type=str, default='interactive',
                       help='what input mode to use, interactive or path')
    group.add_argument('--output-path', type=str, default='./samples',
                       help='path to place the generated samples')
    group.add_argument('--with-id', action='store_true',
                       help='If each line is prepended with an id.')
    group.add_argument('--max-inference-batch-size', type=int, default=12)
    group.add_argument('--device', type=int, default=-1)
    return parser

def add_data_args(parser):
    """Train/valid/test data arguments."""

    group = parser.add_argument_group('data', 'data configurations')

    group.add_argument('--model-parallel-size', type=int, default=1,
                       help='size of the model parallel.')
    group.add_argument('--shuffle', action='store_true',
                       help='Shuffle data. Shuffling is deterministic '
                       'based on seed and current epoch.')
    group.add_argument('--filter-english', action='store_true')
    group.add_argument('--train-data', nargs='+', default=None,
                       help='Whitespace separated filenames or corpora names '
                       'for training.')
    group.add_argument('--valid-data', nargs='*', default=None,
                       help='Filename for validation data.')
    group.add_argument('--test-data', nargs='*', default=None,
                       help='Filename for testing')
    group.add_argument('--data-dir', type=str, default=None,
                       help='The data path to all the data files')
    group.add_argument('--use-npy-data-loader', action='store_true',
                       help='Use the numpy data loader. If set, then '
                       'train-data-path, val-data-path, and test-data-path '
                       'should also be provided.')
    group.add_argument('--train-data-path', type=str, default='',
                       help='path to the training data')
    group.add_argument('--val-data-path', type=str, default='',
                       help='path to the validation data')
    group.add_argument('--test-data-path', type=str, default='',
                       help='path to the test data')
    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
                       help='the filename containing all the shards sizes')

    group.add_argument('--delim', default=',',
                       help='delimiter used to parse csv data files')
    group.add_argument('--text-key', default='sentence',
                       help='key to use to extract text from json/csv')
    group.add_argument('--eval-text-key', default=None,
                       help='key to use to extract text from '
                       'json/csv evaluation datasets')
    group.add_argument('--split', default='1000,1,1',
                       help='comma-separated list of proportions for training, '
                       'validation, and test split')

    group.add_argument('--no-lazy-loader', action='store_true',
                       help='whether to lazy read the data set')
    group.add_argument('--half-lazy-loader', action='store_true')
    group.add_argument('--loader-scatter', type=int, default=None,
                       help='Number of scatters to use for dataloaders')
    group.add_argument('--lazy-loader', action='store_true',
                       help='whether to lazy read the data set')
    group.add_argument('--loose-json', action='store_true',
                       help='Use loose json (one json-formatted string per '
                       'newline), instead of tight json (data file is one '
                       'json string)')
    group.add_argument('--presplit-sentences', action='store_true',
                       help='Dataset content consists of documents where '
                       'each document consists of newline separated sentences')
    group.add_argument('--num-workers', type=int, default=2,
                       help='Number of workers to use for dataloading')

    group.add_argument('--block-size', type=int, default=10000,
                       help='Size of block to reduce memory in dataset')

    return parser

def add_generation_api_args(parser):
    """generation api arguments"""

    group = parser.add_argument_group('api', 'api configurations')

    group.add_argument('--img_folder_path', default='image/')
    group.add_argument('--input_folder_path', default='input/')
    group.add_argument('--input_rec_path', default='input/')
    group.add_argument('--check_mode', default='code')
    group.add_argument('--time_interval', default=10)

    return parser

def add_tokenization_args(parser):
    """Tokenization arguments."""

    group = parser.add_argument_group('Tokenization', 'tokenization configurations')
    group.add_argument('--tokenizer-model-type', type=str, default=None,
                       help="Model type to use for sentencepiece tokenization "
                       "(one of ['bpe', 'char', 'unigram', 'word']) or "
                       "bert vocab to use for BertWordPieceTokenizer (one of "
                       "['bert-large-uncased', 'bert-large-cased', etc.])")
    group.add_argument('--tokenizer-path', type=str, default='tokenizer.model',
                       help='path used to save/load sentencepiece tokenization models')
    group.add_argument('--img-tokenizer-path', type=str, default=None,
                       help='The checkpoint file path of image tokenizer.')
    group.add_argument('--tokenizer-type', type=str,
                       default='ChineseSPTokenizer',  # BertWordPieceTokenizer->ChineseSPTokenizer
                       choices=[
                           'CharacterLevelTokenizer', 'SentencePieceTokenizer',
                           'BertWordPieceTokenizer', 'GPT2BPETokenizer',
                           'ChineseSPTokenizer', 'glm_ChineseSPTokenizer'
                       ],
                       help='what type of tokenizer to use')
    group.add_argument('--fix-command-token', action='store_true')
    group.add_argument('--not-pre-tokenize', action='store_true')
    group.add_argument('--cache-dir', default='cache', type=str,  # None->'cache'
                       help='Where to store pre-trained BERT downloads')
    group.add_argument('--use-tfrecords', action='store_true',
                       help='load `--train-data`, `--valid-data`, '
                       '`--test-data` from BERT tf records instead of '
                       'normal data pipeline')
    group.add_argument('--seq-length', type=int, default=512,
                       help='Maximum sequence length to process')
    group.add_argument('--mem-length', type=int, default=0,
                       help='The memory length to preserve')
    group.add_argument('--max-preds-per-seq', type=int, default=None,
                       help='Maximum number of predictions to use per sequence. '
                       'Defaults to math.ceil(`--seq-length`*.15/10)*10. '
                       'MUST BE SPECIFIED IF `--use-tfrecords` is True.')
    group.add_argument('--non-sentence-start', type=float, default=0.0)
    group.add_argument('--sample-one-document', action='store_true',
                       help='only sample one document in one sample')
    group.add_argument('--load-splits', type=str, default=None,
                       help='The path to load split indices from')
    group.add_argument('--save-splits', type=str, default=None,
                       help='The path to save split indices to')
    group.add_argument('--save-test-data', type=str, default=None,
                       help='The path to save the test data')
    group.add_argument('--multi-task-data', nargs='*', default=None,
                       help='Downstream task names for multi-task pre-training')
    group.add_argument('--multi-task-ratio', type=float, default=0.0,
                       help='Ratio for multi-task pre-training')
    group.add_argument('--multi-seq-length', type=int, default=None)
    group.add_argument('--multi-batch-size', type=int, default=None)
    return parser

def add_glm_args(parser):
    """Arguments for GLM"""
    group = parser.add_argument_group('GLM', 'GLM Configurations')
    group.add_argument('--block-lm', action='store_true',
                       help='whether to use BlockLM pre-training')
    group.add_argument('--masked-lm', action='store_true',
                       help='whether to use the mlm objective')
    group.add_argument('--bert-prob', type=float, default=0.5)
    group.add_argument('--gpt-infill-prob', type=float, default=0.5)
    group.add_argument('--gpt-min-ratio', type=float, default=0.5)
    group.add_argument('--gap-sentence-prob', type=float, default=0.0)
    group.add_argument('--gap-sentence-ratio', type=float, default=0.15)
    group.add_argument('--avg-block-length', type=int, default=3)
    group.add_argument('--short-seq-prob', type=float, default=0.0)
    group.add_argument('--single-span-prob', type=float, default=0.0)
    group.add_argument('--task-mask', action='store_true',
                       help='Use different mask for generation and blank filling')
    group.add_argument('--no-shuffle-block', action='store_true',
                       help='do not shuffle the blocks when filling the blank')
    group.add_argument('--no-block-position', action='store_true',
                       help='Use (rough) absolute positions instead of block positions')
    group.add_argument('--sentinel-token', action='store_true',
                       help='Use sentinel (mask) tokens to replace 2d position encoding')
    group.add_argument('--block-mask-prob', type=float, default=0.0)
    group.add_argument('--context-mask-ratio', type=float, default=0.0)
    group.add_argument('--random-position', action='store_true',
                       help='Use random start position to cover all the position embeddings')
    group.add_argument('--old-checkpoint', action='store_true',
                       help='Load the checkpoint from the old library')

    group.add_argument('--sampling-strategy', type=str, default='BaseStrategy',
                       help='type name of sampling strategy')
    return parser

def add_finetune_config_args(parser):
    group = parser.add_argument_group('finetune', 'finetune configurations')
    group.add_argument('--task', type=str, help='Task name.')
    group.add_argument('--load-pretrained', type=str, help='Load pretrained model',
                       default='/root/yuandong_use/GR/glm_finetuned_model/blocklm-10B-kbqa_08-18-16-45')
    # None->/root/yuandong_use/GR/glm_finetuned_model/blocklm-10B-kbqa_08-18-16-45
    group.add_argument('--pool-token', type=str, choices=['start', 'pad', 'cls'],
                       help='The token to pool the sequence representation',
                       default='cls')
    group.add_argument('--cloze-eval', action='store_true',
                       help='Evaluation dataset with cloze task')
    group.add_argument('--multi-token', action='store_true',
                       help='Use multi token for cloze evaluation')
    group.add_argument('--segment-length', type=int, default=0,
                       help='The maximum segment length for cloze evaluation')
    group.add_argument('--loss-func', type=str,
                       choices=['cross_entropy', 'hinge', 'generative', 'mix'],
                       default='cross_entropy')
    group.add_argument('--block-lm-ratio', type=float, default=0.0)
    group.add_argument('--adapet', action='store_true',
                       help='Use the decoupled cross entropy loss in AdaPET')
    group.add_argument('--pattern-id', type=int, default=0)
    group.add_argument('--fast-decode', action='store_true',
                       help='Fast decode for multi-token cloze. '
                       'Can only be used without checkpoint activation.')
    group.add_argument('--few-superglue', action='store_true')
    group.add_argument('--eval-valid', action='store_true',
                       help='Whether to evaluate on the valid set')
    group.add_argument('--validation-metric', type=str, default=None)
    group.add_argument('--unidirectional', action='store_true',
                       help='Use the left-to-right language model')
    group.add_argument('--src-seq-length', type=int, default=None)
    group.add_argument('--tgt-seq-length', type=int, default=None)
    group.add_argument('--adam-beta1', type=float, default=0.9)
    group.add_argument('--adam-beta2', type=float, default=0.999)
    group.add_argument('--adam-eps', type=float, default=1e-8)
    group.add_argument('--optimizer', type=str, choices=['adam', 'adafactor'],
                       default='adam')
    group.add_argument('--wsc-negative', action='store_true')
    group.add_argument('--overwrite', action='store_true')
    group.add_argument('--no-validation', action='store_true')
    # Continuous prompt arguments
    group.add_argument('--continuous-prompt', action='store_true',
                       help='Use continuous prompt for PET')
    group.add_argument('--num-prompt-tokens', type=int, default=0)
    group.add_argument('--prompt-func', default='lstm', choices=['lstm', 'mlp', 'none'])
    group.add_argument('--freeze-transformer', action='store_true', default=False)
    group.add_argument('--tune-prefix-layers', type=int, default=None)
    group.add_argument('--prefix-prompt', type=int, default=0)
    group.add_argument('--prompt-init', action='store_true', default=False)
    return parser

def get_args():
    """Parse all the args."""

    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
    parser = add_model_config_args(parser)
    parser = add_fp16_config_args(parser)
    parser = add_training_args(parser)
    parser = add_evaluation_args(parser)
    parser = add_data_args(parser)
    parser = add_tokenization_args(parser)
    parser = add_text_generate_args(parser)
    parser = add_generation_api_args(parser)
    parser = add_glm_args(parser)
    parser = add_finetune_config_args(parser)

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args, unknown = parser.parse_known_args()
    if not args.train_data and not args.train_data_path:
        print('WARNING: No training data specified')

    args.cuda = torch.cuda.is_available()

    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
    if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
        mpi_define_env(args)
    if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
        # We are using (OpenMPI) mpirun to launch distributed data parallel processes
        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))

        # Possibly running with Slurm
        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
        nodeid = int(os.getenv('SLURM_NODEID', '0'))

        args.local_rank = local_rank
        args.rank = nodeid * local_size + local_rank
        args.world_size = num_nodes * local_size

    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))

    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True
        if args.rank == 0:
            print(' > using dynamic loss scaling')

    # The fp32_*/fp16_* args are only meant to be active when the fp16 arg is
    # set, so they all default to False.
    if not args.fp16:
        args.fp32_embedding = False
        args.fp32_tokentypes = False
        args.fp32_layernorm = False

    if hasattr(args, 'deepspeed'
               ) and args.deepspeed and args.deepspeed_config is not None:
        with open(args.deepspeed_config) as file:
            deepspeed_config = json.load(file)
        if 'fp16' in deepspeed_config and deepspeed_config['fp16']['enabled']:
            args.fp16 = True
        else:
            args.fp16 = False
        if args.checkpoint_activations:
            args.deepspeed_activation_checkpointing = True
        if 'train_micro_batch_size_per_gpu' in deepspeed_config:
            args.batch_size = deepspeed_config['train_micro_batch_size_per_gpu']
        if 'gradient_accumulation_steps' in deepspeed_config:
            args.gradient_accumulation_steps = deepspeed_config[
                'gradient_accumulation_steps']
        else:
            args.gradient_accumulation_steps = None
        if 'optimizer' in deepspeed_config:
            optimizer_params_config = deepspeed_config['optimizer'].get('params', {})
            args.lr = optimizer_params_config.get('lr', args.lr)
            args.weight_decay = optimizer_params_config.get(
                'weight_decay', args.weight_decay)
    return args

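Note the precedence in get_args(): when --deepspeed is set with a config file, the JSON values win over the CLI defaults for batch size, gradient accumulation, fp16 and optimizer hyperparameters. A minimal sketch with illustrative values (not the shipped configuration):

# Sketch: a DeepSpeed JSON overrides the CLI defaults parsed above.
import json, sys, tempfile

ds_config = {
    'train_micro_batch_size_per_gpu': 8,   # -> args.batch_size
    'gradient_accumulation_steps': 4,      # -> args.gradient_accumulation_steps
    'fp16': {'enabled': True},             # -> args.fp16 = True
    'optimizer': {'params': {'lr': 2e-5, 'weight_decay': 0.1}},
}
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    json.dump(ds_config, f)

sys.argv = ['prog', '--deepspeed', '--deepspeed_config', f.name]
args = get_args()
assert args.batch_size == 8 and args.fp16
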
def mpi_define_env(args):
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    master_addr = None
    if rank == 0:
        master_addr = get_hostname()
    master_addr = comm.bcast(master_addr, root=0)

    # Determine local rank by assuming hostnames are unique
    proc_name = MPI.Get_processor_name()
    all_procs = comm.allgather(proc_name)
    local_rank = sum([i == proc_name for i in all_procs[:rank]])

    os.environ['RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    args.local_rank = local_rank
    args.world_size = world_size
    args.rank = rank
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = '29500'  # TORCH_DISTRIBUTED_DEFAULT_PORT = 29500

    print('Discovered MPI settings of world_rank={}, local_rank={}, '
          'world_size={}, master_addr={}, master_port={}'.format(
              os.environ['RANK'], args.local_rank, os.environ['WORLD_SIZE'],
              os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']))
115 modelscope/models/nlp/txl_poem/com_utils/http_utils.py (Executable file)
@@ -0,0 +1,115 @@
# Copyright (c) 2022 Zhipu.AI
import csv
import traceback
from io import StringIO
from urllib import parse

from flask import Response, jsonify, request, send_file


class APIException(Exception):

    def __init__(self, message):
        super().__init__(message)


class IllegalParamException(APIException):

    def __init__(self, error):
        self.error = error
        super(IllegalParamException, self).__init__(error)


class InputTooLongException(APIException):

    def __init__(self, message, payload=None):
        self.payload = payload
        super().__init__(message)


class CanNotReturnException(APIException):

    def __init__(self, message, payload=None):
        self.payload = payload
        super().__init__(message)


class MongoDBException(APIException):

    def __init__(self, error):
        self.error = error
        super(MongoDBException, self).__init__(error)


class MissParameterException(APIException):

    def __init__(self, error):
        self.error = error
        super(MissParameterException, self).__init__(error)


class HttpUtil:

    @staticmethod
    def http_response(status=0, message='success', data=None, total=False):
        # if status and not isinstance(data, APIException):
        #     sm.send_content(request.url_rule, traceback.format_exc(), request.data)
        if isinstance(data, Exception):
            data = str(data)
        r = {'status': status, 'message': message, 'result': data or []}
        if total and type(data) == list:
            if type(total) == int:
                r['total'] = total
            else:
                r['total'] = len(data)
        return jsonify(r)

    @staticmethod
    def check_param(name,
                    request,  # noqa
                    method=0,
                    param_type=None,
                    default=None,
                    required=True):
        if method == 0:
            param = request.args.get(name)
        else:
            try:
                param = request.json.get(name)
            except Exception as e:  # noqa
                raise IllegalParamException('data format json')

        if param is None:
            if not required:
                return default
            raise IllegalParamException('param {} is required'.format(name))
        else:
            if param_type and type(param) != param_type:
                try:
                    return param_type(param)
                except ValueError:
                    raise IllegalParamException(
                        'param {}: type wrong, not {}'.format(name, param_type))
            else:
                return param

    @staticmethod
    def csv_file_response(data, filename):
        response = Response(HttpUtil.get_csv_stream(data), mimetype='text/csv')
        response.headers[
            'Content-Disposition'] = f'attachment; filename={parse.quote(filename)}.csv'
        return response

    @staticmethod
    def get_csv_stream(data):
        line = StringIO()
        csv_writer = csv.writer(line)
        csv_writer.writerow(['name', 'org', 'position', 'email', 'phone'])
        for p in data:
            csv_writer.writerow(
                [p['name'], p['aff'], p['position'], p['email'], p['phone']])
        res = line.getvalue()
        line.close()
        return res
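A sketch of how these helpers compose inside a Flask view; the route and field names below are hypothetical, not from this diff:

from flask import Flask

app = Flask(__name__)

@app.route('/poem', methods=['POST'])
def poem_api():
    # method=1 reads the JSON body; method=0 would read query-string args.
    title = HttpUtil.check_param('title', request, method=1, param_type=str)
    num = HttpUtil.check_param('num', request, method=1, param_type=int,
                               default=1, required=False)
    return HttpUtil.http_response(data={'title': title, 'num': num})
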
981 modelscope/models/nlp/txl_poem/fastpoem.py (Executable file)
@@ -0,0 +1,981 @@
# Copyright (c) 2022 Zhipu.AI
"""Sample Generate GPT2"""

import argparse
import copy
import os
import random
import time
from datetime import datetime

import deepspeed
import jsonlines
import numpy as np
import torch
import torch.nn.functional as F
from pypinyin import FINALS, FINALS_TONE, TONE3, pinyin

from .arguments import get_args
from .com_utils.http_utils import (CanNotReturnException,
                                   InputTooLongException,
                                   MissParameterException)
from .gpt2 import mpu
from .gpt2.configure_data import configure_data
from .gpt2.data_utils import make_tokenizer
from .gpt2.fp16 import FP16_Module
from .gpt2.model import DistributedDataParallel as DDP
from .gpt2.model import GPT2Model
from .gpt2.utils import (Timers, get_checkpoint_iteration, load_checkpoint,
                         print_rank_0)

open_old_pronounce = 1

def get_model(args):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        max_memory_length=args.mem_length,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=True,
        relative_encoding=args.transformer_xl)

    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])),
            flush=True)

    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
    if hasattr(args, 'deepspeed') and args.deepspeed and args.fp16:
        model.half()

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if not args.deepspeed:
        if USE_TORCH_DDP:
            i = torch.cuda.current_device()
            model = DDP(model,
                        device_ids=[i],
                        output_device=i,
                        process_group=mpu.get_data_parallel_group())
        else:
            model = DDP(model)

    return model

def get_masks_and_position_ids(data,
                               eod_token,
                               reset_position_ids,
                               reset_attention_mask,
                               loss_mask=None,
                               attention_mask=None,
                               transformer_xl=False,
                               mem_length=None):
    # Extract batch size and sequence length.
    batch_size, seq_length = data.size()

    # Attention mask (lower triangular).
    if transformer_xl:
        if attention_mask is None:
            attention_mask = torch.ones(
                (1, seq_length, seq_length + mem_length), device=data.device)
        attention_mask = torch.tril(
            torch.triu(attention_mask, 1 - seq_length + mem_length), mem_length)
    else:
        if reset_attention_mask:
            att_mask_batch = batch_size
        else:
            att_mask_batch = 1
        if attention_mask is None:
            attention_mask = torch.ones(
                (att_mask_batch, seq_length, seq_length), device=data.device)
        attention_mask = torch.tril(attention_mask)
        attention_mask = attention_mask.unsqueeze(1)

    # Loss mask.
    if loss_mask is None:
        loss_mask = torch.ones(
            data.size(), dtype=torch.float, device=data.device)
        loss_mask[data == eod_token] = 0.0

    # Position ids.
    position_ids = torch.arange(
        seq_length, dtype=torch.long, device=data.device)
    position_ids = position_ids.unsqueeze(0).expand_as(data)
    if not transformer_xl:
        # We need to clone as the ids will be modified based on batch index.
        if reset_position_ids:
            position_ids = position_ids.clone()

        if reset_position_ids or reset_attention_mask:
            # Loop through the batches:
            for b in range(batch_size):

                # Find indices where the EOD token is.
                eod_index = position_ids[b, data[b] == eod_token]
                # Detach indices from positions if going to modify positions.
                if reset_position_ids:
                    eod_index = eod_index.clone()

                # Loop through EOD indices:
                prev_index = 0
                for j in range(eod_index.size()[0]):
                    i = eod_index[j]
                    # Mask attention loss.
                    if reset_attention_mask:
                        attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
                    # Reset positions.
                    if reset_position_ids:
                        position_ids[b, (i + 1):] -= (i + 1 - prev_index)
                        prev_index = i + 1

    return attention_mask, loss_mask, position_ids

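A sketch of the shapes produced on the standard GPT-2 path (transformer_xl=False); the eod_token value is illustrative:

data = torch.tensor([[5, 7, 0, 9, 2]])  # (batch=1, seq=5); 0 marks end-of-document
attn, loss_mask, pos = get_masks_and_position_ids(
    data, eod_token=0, reset_position_ids=False, reset_attention_mask=False)
print(attn.shape)    # (1, 1, 5, 5): lower-triangular causal mask
print(loss_mask[0])  # ones everywhere except the EOD position
print(pos[0])        # tensor([0, 1, 2, 3, 4])
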
def set_random_seed(seed):
    """Set random seed for reproducibility."""

    if seed is not None and seed > 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        mpu.model_parallel_cuda_manual_seed(seed)

def initialize_distributed(args):
    """Initialize torch.distributed."""

    # Manually set the device ids.
    device = args.rank % torch.cuda.device_count()
    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)
    # Call the init process
    init_method = 'tcp://'
    # master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_ip = os.getenv('MASTER_ADDR', '127.0.0.1')
    master_port = os.getenv('MASTER_PORT', '6001')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend=args.distributed_backend,
        world_size=args.world_size,
        rank=args.rank,
        init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)

    # Optional DeepSpeed Activation Checkpointing Features
    if hasattr(args, 'deepspeed'
               ) and args.deepspeed and args.deepspeed_activation_checkpointing:
        set_deepspeed_activation_checkpointing(args)

def setup_model(args):
    """Setup model and optimizer."""

    model = get_model(args)

    # if args.deepspeed:
    #     print_rank_0("DeepSpeed is enabled.")
    #
    #     model, _, _, _ = deepspeed.initialize(
    #         model=model,
    #         model_parameters=model.parameters(),
    #         args=args,
    #         mpu=mpu,
    #         dist_init_required=False
    #     )
    if args.load is not None:
        if args.deepspeed:
            iteration, release, success = get_checkpoint_iteration(args)
            print(args.load)
            path = os.path.join(args.load, 'mp_rank_00_model_states.pt')
            checkpoint = torch.load(path, map_location=torch.device('cpu'))
            model.load_state_dict(checkpoint['module'])
        else:
            _ = load_checkpoint(
                model, None, None, args, load_optimizer_states=False)
    # if args.deepspeed:
    #     model = model.module

    return model

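How the pieces above are meant to compose for inference — a sketch of the call order, not the shipped entry point:

# args = get_args()              # CLI plus optional DeepSpeed JSON
# initialize_distributed(args)   # torch.distributed + model/data parallel groups
# set_random_seed(args.seed)
# model = setup_model(args)      # build GPT2Model, load the --load checkpoint
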
def get_batch(context_tokens, device, args):
    tokens = context_tokens
    tokens = tokens.view(args.batch_size, -1).contiguous()
    tokens = tokens.to(device)

    # Get the masks and position ids.
    attention_mask, loss_mask, position_ids = get_masks_and_position_ids(
        tokens,
        args.eod_token,
        reset_position_ids=False,
        reset_attention_mask=False,
        transformer_xl=args.transformer_xl,
        mem_length=args.mem_length)

    return tokens, attention_mask, position_ids

def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    # This function has been mostly taken from huggingface conversational ai code at
    # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313

    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # convert to 1D
        logits = logits.view(logits.size()[1]).contiguous()
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
        # going back to 2D
        logits = logits.view(1, -1).contiguous()

    return logits

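A sketch of the effect of top-k filtering on a toy distribution (illustrative values):

logits_demo = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])
filtered = top_k_logits(logits_demo.clone(), top_k=2)  # keep the 2 best tokens
print(F.softmax(filtered, dim=-1))  # only the first two entries are non-zero
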
# Characters with an entering-tone (入声) reading under the old pronunciation
# rules; checkpz() treats them as oblique (ze).
rus = set([
    '八', '搭', '塌', '邋', '插', '察', '杀', '煞', '夹', '俠', '瞎', '辖', '狹', '匣', '黠',
    '鸭', '押', '压', '刷', '刮', '滑', '猾', '挖', '蜇', '舌', '鸽', '割', '胳', '搁', '瞌',
    '喝', '合', '盒', '盍', '曷', '貉', '涸', '劾', '核', '钵', '剝', '泼', '摸', '脱', '托',
    '捋', '撮', '缩', '豁', '活', '切', '噎', '汁', '织', '隻', '掷', '湿', '虱', '失', '十',
    '什', '拾', '实', '食', '蝕', '识', '石', '劈', '霹', '滴', '踢', '剔', '屐', '积', '激',
    '击', '漆', '吸', '息', '媳', '昔', '席', '锡', '檄', '觋', '揖', '一', '壹', '扑', '匍',
    '仆', '弗', '紱', '拂', '福', '蝠', '幅', '辐', '服', '伏', '茯', '督', '突', '秃', '俗',
    '出', '蜀', '窟', '哭', '忽', '惚', '斛', '鹄', '屋', '屈', '诎', '曲', '戌', '拍', '塞',
    '摘', '拆', '黑', '勺', '芍', '嚼', '粥', '妯', '熟', '白', '柏', '伯', '薄', '剥', '摸',
    '粥', '轴', '舳', '妯', '熟', '角', '削', '学'
])
# Toneless pinyin syllables treated as oblique under the same rules.
ss = set([
    'de', 'te', 'le', 'ze', 'ce', 'se', 'fa', 'fo', 'dei', 'zei', 'gei', 'hei',
    'sei', 'bie', 'pie', 'mie', 'die', 'tie', 'nie', 'lie', 'kuo', 'zhuo',
    'chuo', 'shuo', 'ruo'
])

def checkpz(st, wd):
    """Classify syllable `st` (pypinyin TONE3 form) of character `wd`:
    0 = no tone digit found, 1 = level tone (ping), 2 = oblique tone (ze)."""
    if not (st[-1] in ['1', '2', '3', '4']):
        return 0

    if open_old_pronounce == 1:
        if wd in rus:
            return 2
        if wd in ['嗟', '瘸', '靴', '爹']:
            return 1
        if st[:-1] in ss:
            return 2

        if (st[-1] == '2' and st[0] in ['b', 'd', 'g', 'j', 'z']):
            return 2
        if 'ue' in st:
            return 2

    if st[-1] in ['1', '2']:
        return 1

    return 2

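A minimal sketch of calling the classifier (run in this module's namespace):

word = '诗'
syllable = pinyin(word, style=TONE3)[0][0]  # e.g. 'shi1'
print(checkpz(syllable, word))              # 1 -> level tone (ping)
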
# Intra-line rhythm check; these constraints must be obeyed.
def checkrhyself(sentence):
    if len(sentence) == 0:
        return 0
    st = sentence
    fullst = False
    while (len(st) > 0 and st[-1] in [',', '。', ',', '?', '?', '!', '!']):
        st = st[:-1]
        fullst = True

    l1 = pinyin(st, style=TONE3)
    if len(l1) < len(st):
        return 1
    for i in l1:
        if len(i[0]) < 2:
            return 1
    if len(st) <= 3:
        return 2

    pz1 = checkpz(l1[1][0], sentence[1])

    if len(st) >= 4:
        pz2 = checkpz(l1[3][0], sentence[3])
        if pz2 + pz1 != 3:
            return 1
    if len(st) >= 6:
        pz3 = checkpz(l1[5][0], sentence[5])
        if pz2 + pz3 != 3:
            return 1
    if fullst:
        if len(sentence) < 6:
            return 1
        pz11 = checkpz(l1[-3][0], st[-3])
        pz12 = checkpz(l1[-2][0], st[-2])
        pz13 = checkpz(l1[-1][0], st[-1])
        if (pz11 == pz12) and (pz12 == pz13):
            return 1

    return 2

def checkrhy(sentence, last, imp, req=0):

    while (len(sentence) > 0
           and (sentence[-1] in [',', '。', ',', '?', '?', '!', '!'])):
        sentence = sentence[:-1]
    if len(sentence) == 0:
        return 0

    while last[-1] in [',', '。', ',', '?', '?', '!', '!']:
        last = last[:-1]
    l1 = pinyin(sentence, style=TONE3)
    l2 = pinyin(last, style=TONE3)
    disobey = 0
    if len(l1) != len(sentence):
        return -1000
    for i in range(len(sentence)):
        if (i < len(l1)) and (i < len(l2)):
            st1 = checkpz(l1[i][0], sentence[i])

            sr1 = checkpz(l2[i][0], last[i])
            if (req == 1 and i % 2 == 1):
                st1 = 3 - st1

            if st1 + sr1 != 3:
                if req == 0:
                    disobey += 0.35
                if i % 2 == 1:
                    disobey += 0.35
                    if req == 1:
                        disobey += 0.2
                if i == len(l2) - 1:
                    disobey += 0.65
                    if req == 1:
                        disobey += 0.35

    disobey *= imp
    disobey = -5 * disobey / len(l2)
    for i in range(len(l1)):
        for j in range(i + 2, len(l1)):
            if l1[i][0][:-1] == l1[j][0][:-1]:
                disobey -= 7 / len(l1)
    return disobey

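A sketch of calling the scorer: checkrhy compares a candidate line against the previous line position by position; 0 means no violations, more negative means worse tonal matching (the lines below are illustrative):

candidate = '白日依山尽'
previous = '黄河入海流,'
print(checkrhy(candidate, previous, imp=1.5, req=1))  # float penalty <= 0
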
def checksentence(sentence,
                  original_context,
                  min_length,
                  max_length,
                  endnote,
                  curvote=0,
                  yayun=None):

    if '<|end' in sentence:
        return 1

    if '的' in sentence:
        return 1
    if len(sentence) == 0:
        return 1
    if ((len(sentence) > max_length and not (sentence[-1] in endnote))
            or len(sentence) == 0) or len(sentence) > max_length + 1:
        return 1
    if (sentence[-1] in endnote) and ((len(sentence) <= min_length) or  # noqa
                                      (len(sentence) == 7)):  # noqa
        return 1

    if (sentence[-1] in endnote) and (sentence[:-1] in original_context):
        return 1

    mdisobey = 0  # noqa
    illegal_notes = [
        ' ', ':', '《', '》', '‘', '“', '-', '——', '⁇', '[', '【', '】', ']', '.',
        '、', '(', '(', ')', ')', '·'
    ]
    if '。' in endnote:
        illegal_notes.extend([',', ','])
    else:
        illegal_notes.append('。')
    for i in range(10):
        illegal_notes.append(str(i))
    for i in range(64, 123):
        illegal_notes.append(chr(i))
    for note in illegal_notes:
        if note in sentence:
            return 1
    last = getlastsentence(original_context)
    if min_length == max_length:
        imp = 1
        if (',' in last) or (',' in last):
            imp = 1.5

        if curvote == 0:
            rt = checkrhy(sentence, last, imp, req=1)
        else:
            rt = checkrhy(sentence, last, imp)
        if rt < -0.75:
            return 1

    for i in range(len(sentence)):
        if min_length == max_length:
            if (i < len(last) - 1) and (sentence[i] == last[i]):
                return 1

        if i < len(sentence) - 1:
            if sentence[i:i + 2] in original_context:
                return 1
            if sentence[i:i + 2] in sentence[:i]:
                return 1

    if checkrhyself(sentence) == 1:
        return 1
    cc = curvote
    if yayun is None:
        cc = 0
    if (cc == 1 and len(sentence) >= max_length):

        final1 = pinyin(sentence, style=FINALS)
        if len(final1) < max_length:
            return 1
        final1 = final1[max_length - 1][0]
        final2 = pinyin(yayun, style=FINALS)[-1][0]
        group = [['a', 'ia', 'ua'], ['ai', 'uai', 'ei', 'ui', 'uei'],
                 ['an', 'uan', 'ian'], ['ie', 'ue', 've'], ['ou', 'iu', 'iou'],
                 ['ang', 'iang', 'uang'], ['ao', 'iao'], ['e', 'o', 'uo'],
                 ['en', 'un', 'uen', 'ong', 'iong', 'in', 'ing', 'er']]
        doc = 0
        if final1 == final2:
            doc = 1
        for i in group:
            if (final1 in i) and (final2 in i):
                doc = 1
        if doc == 0:
            return 1

    if (sentence[-1] in endnote):
        return 0

    return 2

def generate_sentence(model,
                      tokenizer,
                      args,
                      device,
                      current_tokens,
                      mems,
                      endnote=[',', ',', '?', '?'],
                      num_candidates=1,
                      min_length=5,
                      max_length=7,
                      yayun=None):
    model.eval()
    with torch.no_grad():
        mct_tree = []
        if mems == []:
            mems = []
            tokens, attention_mask, position_ids = get_batch(
                current_tokens, device, args)
            logits, *rts = model(tokens, position_ids, attention_mask, *mems)
        else:
            tokens = current_tokens
            index = len(tokens[0])
            logits, *rts = model(
                tokens[:, index - 1:index],
                tokens.new_ones((1, 1)) * (index - 1),
                tokens.new_ones(
                    1,
                    1,
                    1,
                    args.mem_length + 1,
                    device=tokens.device,
                    dtype=torch.float), *mems)

        output_tokens_list = tokens.view(-1).contiguous()
        original_context = tokenizer.DecodeIds(output_tokens_list.tolist())
        context_length = len(tokens[0])
        logits = logits[0, -1]
        mct_tree.append([
            logits, rts, tokens, -np.ones(len(logits)),
            torch.ones(len(logits)).cuda(), 0
        ])
        final_result = []
        nextid = 0
        tries = 0
        max_tries = num_candidates * 30
        curvote = 1
        if ',' in endnote:
            curvote = 0
        if ',' in endnote:
            endid = 43359
        else:
            endid = 43361
        dpcount = 0

        tmp = args.temperature

        while ((len(final_result) < num_candidates) and (tries < max_tries)
               and (tries < 1000)):
            currentid = nextid
            tries += 1
            while currentid != -1:
                tc = torch.log(mct_tree[currentid][4])
                tc = tc + F.relu(tc - 10) * 1000
                logits = mct_tree[currentid][0].view(-1) - tc * 0.5
                logits = logits[:50001]
                log_probs = F.softmax(logits, dim=-1)

                pr = torch.multinomial(log_probs, num_samples=1)[0]
                prev = pr.item()
                mct_tree[currentid][4][prev] += 1
                lastid = currentid
                currentid = int(mct_tree[currentid][3][prev])
            # start from lastid & currentid

            cqs = mct_tree[lastid][2]
            tokens = torch.cat((cqs, pr.unsqueeze(0).view(1, 1)), dim=1)
            output_tokens_list = tokens.view(-1).contiguous()
            sentence = tokenizer.DecodeIds(
                output_tokens_list[context_length:].tolist())
            logit = mct_tree[lastid][0]
            log_probs = F.softmax(logit, dim=-1)
            log_pbs = torch.log(log_probs)
            score = log_pbs[prev].item()
            nextid = 0
            ip = checksentence(
                sentence,
                original_context,
                min_length,
                max_length,
                endnote,
                curvote=curvote,
                yayun=yayun)
            for j in final_result:
                if j[0] == sentence:
                    ip = 1
                if ('<|end' in sentence) and ('<|end' in j[0]):
                    ip = 1

            score = mct_tree[lastid][5] + score
            if (ip == 1):
                nextid = lastid
                dpcount += 1
                max_tries += 1
                if (dpcount >= 50) or (dpcount >= 8
                                       and len(sentence) < max_length):
                    nextid = 0
                    dpcount = 0
                    mct_tree[lastid][4][prev] = 100000
                continue
            dpcount = 0
            if (ip == 0):
                mct_tree[lastid][4][prev] = 100000
                yay = yayun
                if curvote == 1:
                    yay = sentence[-2]

                final_result.append([
                    copy.deepcopy(sentence),
                    copy.deepcopy(score),
                    copy.deepcopy(tokens),
                    copy.deepcopy(mct_tree[lastid][1]), yay
                ])
                continue

            mct_tree[lastid][3][prev] = len(mct_tree)
            tmp = args.temperature
            if (len(sentence) >= 4
                    or (len(sentence) == 3 and max_length == 5)):
                tmp = tmp * 0.6
            rts = mct_tree[lastid][1]
            index = len(tokens[0])

            logits, *rts = model(
                tokens[:, index - 1:index],
                tokens.new_ones((1, 1)) * (index - 1),
                tokens.new_ones(
                    1,
                    1,
                    1,
                    args.mem_length + 1,
                    device=tokens.device,
                    dtype=torch.float), *rts)
            logits = logits[0, -1] / tmp
            if len(sentence) == max_length:
                logits[endid] += 10
            mct_tree.append([
                logits, rts, tokens, -np.ones(len(logits)),
                torch.ones(len(logits)).cuda(), score
            ])
            nextid = len(mct_tree) - 1
|
||||
del mct_tree
|
||||
torch.cuda.empty_cache()
|
||||
res = {}
|
||||
res['output_tokens_length'] = len(output_tokens_list)
|
||||
res['result'] = final_result
|
||||
return res
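
# Note on the search structure above: each `mct_tree` node is a six-element
# list holding [logits at the node, transformer memory states, the token
# tensor so far, a child-index array per vocabulary id (-1 = unexpanded),
# per-token visit counts used to penalise re-sampling, cumulative log-prob
# score]. Setting a visit count to 100000 effectively prunes that branch.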


def getlength(text):
    for note in ['。', ',', '?', '?', ' ', '!', '!', ':']:
        text = text.replace(note, ',')
    sp = text.split(',')

    return len(sp[-2])


def getlastsentence(text):
    for note in ['。', ',', '?', '?', ' ', '!', '!', ':']:
        text = text.replace(note, ',')
    sp = text.split(',')
    fom = sp[-1]
    if len(fom) == 0:
        fom = sp[-2]
    return fom + text[-1]
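
# Worked example: for text = '床前明月光,疑是地上霜。', both helpers first
# normalise every separator to ',' giving '床前明月光,疑是地上霜,', so
# getlength(text) returns 5 (the length of the last complete clause) and
# getlastsentence(text) returns '疑是地上霜。' (that clause plus the final
# punctuation mark).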


def generate_string(model,
                    tokenizer,
                    args,
                    device,
                    title,
                    author,
                    desc=None,
                    length=None,
                    st=None,
                    lycr=5,
                    senlength=4):
    lycr_str = ''
    senlength_str = ''
    if lycr == 5:
        lycr_str = '诗体:五言'
    else:
        lycr_str = '诗体:七言'
    if senlength == 4:
        senlength_str = '格律:绝句'
    else:
        senlength_str = '格律:律诗'
    input_str = title + ' 作者:' + author + ' 体裁:诗歌' + lycr_str + senlength_str + '题名:' + title + ' 正文: '  # noqa
    if desc is not None:
        input_str = title + ' 作者:' + author + ' 体裁:诗歌' + lycr_str + senlength_str + '描述:' + desc + ' 题名:' + title + ' 正文: '  # noqa
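    # The conditioning prompt is built from Chinese control fields:
    # 作者 = author, 体裁:诗歌 = genre: poetry, 诗体:五言/七言 = five-/seven-
    # character lines, 格律:绝句/律诗 = quatrain/regulated verse, 描述 =
    # description, 题名 = title, 正文 = body text (generation starts after it).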
    input_len = len(input_str)  # noqa
    context_count = 0  # noqa
    model.eval()
    with torch.no_grad():
        context_tokens = tokenizer.EncodeAsIds(input_str).tokenization
        eo_tokens = tokenizer.EncodeAsIds('<|endoftext|>').tokenization
        context_length = len(context_tokens)
        if context_length >= args.seq_length:
            res = {}
            res['prompt_token_num'] = 0
            res['completion_token_num'] = 0
            res['text'] = ''
            res['errmsg'] = ('the text you entered is too long, '
                             'please reduce the number of characters')
            raise InputTooLongException(res['errmsg'], res)

        context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
        eo_token_tensor = torch.cuda.LongTensor(eo_tokens)  # noqa
        context_length_tensor = torch.cuda.LongTensor([context_length])
        context_length = context_length_tensor[0].item()

        start_time = time.time()  # noqa

        counter, mems = 0, []  # noqa
        org_context_length = context_length  # noqa
        completion_token_length = context_length
        beam_size = 1
        beam_candidate = 1
        beam_max = 1  # noqa
        max_headings = 4  # noqa
        final_storage = []  # noqa
        final_storage_score = []  # noqa
        step = senlength + 1
        if st is None:
            st = 8
        overall_score = []
        past_beam_id = []

        res = generate_sentence(
            model,
            tokenizer,
            args,
            device,
            context_tokens_tensor, [],
            min_length=lycr - 1,
            max_length=lycr,
            num_candidates=beam_size)
        beam_sentences = res.get('result', [])
        completion_token_length = res.get('output_tokens_length', 0)

        if len(beam_sentences) == 0:
            res = {}
            res['prompt_token_num'] = context_length
            res['completion_token_num'] = 0
            res['text'] = ''
            res['errmsg'] = '太难了,写不出来。'
            raise CanNotReturnException('太难了,写不出来。', res)

        for i in range(step):
            beam_new_sentences = []

            endnote = [',', ',', '?', '?']
            if i % 2 == 0:
                endnote = ['。', '?', '?', '!', '!']
            overall_score = []  # noqa
            past_beam_id = []  # noqa
            id = 0
            current_sentence = input_str + beam_sentences[0][0]

            ini_score = beam_sentences[id][1]  # noqa
            token_tensor = beam_sentences[id][2]
            mems = beam_sentences[id][3]

            len_sentence = getlength(beam_sentences[id][0])  # noqa

            res = generate_sentence(
                model,
                tokenizer,
                args,
                device,
                token_tensor,
                mems,
                num_candidates=beam_candidate,
                endnote=endnote,
                min_length=lycr - 1,
                max_length=lycr,
                yayun=beam_sentences[id][-1])
            gen = res.get('result', [])
            completion_token_length = res.get('output_tokens_length', 0)
            if len(gen) == 0:
                res = {}
                res['prompt_token_num'] = context_length
                res['completion_token_num'] = context_length
                res['text'] = ''
                res['errmsg'] = '太难了,写不出来。'
                raise CanNotReturnException('太难了,写不出来。', res)
            jj = gen[0]
            if ('<|end' in jj[0] or i == senlength - 1):
                if (i % 2 == 1 and i > -3):
                    del beam_sentences
                    del beam_new_sentences
                    torch.cuda.empty_cache()
                    res = {}
                    res['prompt_token_num'] = context_length
                    res['completion_token_num'] = completion_token_length
                    res['text'] = current_sentence
                    return res
                else:
                    res = generate_sentence(
                        model,
                        tokenizer,
                        args,
                        device,
                        token_tensor,
                        mems,
                        num_candidates=beam_candidate,
                        endnote=endnote,
                        min_length=lycr - 1,
                        max_length=lycr,
                        yayun=beam_sentences[id][-1])
                    gen = res.get('result', [])
                    completion_token_length = res.get(
                        'output_tokens_length', 0)

                    if len(gen) == 0:
                        res = {}
                        res['prompt_token_num'] = context_length
                        res['completion_token_num'] = 0
                        res['text'] = ''
                        res['errmsg'] = '太难了,写不出来。'
                        raise CanNotReturnException('太难了,写不出来。', res)
            st = jj[0]
            # experiments show this works better in general

            jj[0] = beam_sentences[id][0] + jj[0]
            jj[1] = 0
            beam_new_sentences.append(jj)
            del beam_sentences
            torch.cuda.empty_cache()
            beam_sentences = beam_new_sentences

        # parallel ends

        del beam_sentences
        del beam_new_sentences
        torch.cuda.empty_cache()
        res = {}
        res['prompt_token_num'] = context_length
        res['completion_token_num'] = 0
        res['text'] = ''
        res['errmsg'] = '太难了,写不出来。'
        raise CanNotReturnException('太难了,写不出来。', res)


def prepare_tokenizer(args):
    tokenizer_args = {
        'tokenizer_type': args.tokenizer_type,
        'corpus': None,
        'model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir
    }
    tokenizer = make_tokenizer(**tokenizer_args)

    num_tokens = tokenizer.num_tokens
    before = num_tokens
    after = before
    multiple = args.make_vocab_size_divisible_by * \
        mpu.get_model_parallel_world_size()  # noqa
    while (after % multiple) != 0:
        after += 1
    print_rank_0('> padded vocab (size: {}) with {} dummy '
                 'tokens (new size: {})'.format(before, after - before,
                                                after))

    args.tokenizer_num_tokens = after
    args.tokenizer_num_type_tokens = tokenizer.num_type_tokens
    args.eod_token = tokenizer.get_command('eos').Id

    args.vocab_size = after
    print('prepare tokenizer done', flush=True)

    return tokenizer
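
# Example of the padding above (hypothetical numbers): with a raw vocabulary
# of 50001 tokens, make_vocab_size_divisible_by = 128 and a model-parallel
# world size of 1, `multiple` is 128 and the vocabulary is padded with 47
# dummy tokens to 50048, the next multiple of 128.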


def set_args():
    args = get_args()
    args.deepspeed = True
    args.num_nodes = 1
    args.num_gpus = 1
    args.model_parallel_size = 1
    args.num_layers = 32
    args.hidden_size = 2560
    args.load = 'modelscope-txl/'
    args.num_attention_heads = 32
    args.max_position_embeddings = 1024
    args.tokenizer_type = 'ChineseSPTokenizer'
    args.cache_dir = 'cache'
    args.fp16 = True
    args.out_seq_length = 180
    args.seq_length = 200
    args.mem_length = 256
    args.transformer_xl = True
    args.temperature = 1.2
    args.top_k = 0
    args.top_p = 0

    return args


def prepare_model(model_dir):
    """Load the model, tokenizer and generation arguments for inference."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()  # noqa

    # Arguments.
    args = set_args()
    args.load = model_dir
    args.mem_length = args.seq_length + args.mem_length - 1

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    args.seed = random.randint(0, 1000000)
    set_random_seed(args.seed)

    # get the tokenizer
    args.tokenizer_path = model_dir
    tokenizer = prepare_tokenizer(args)

    # Model.
    model = setup_model(args)

    # setting default batch size to 1
    args.batch_size = 1

    return model, tokenizer, args


def fast_poem(content, model, tokenizer, args):
    title = content['title']
    author = content['author']
    desc = content['desc']
    lycr = content['lycr']
    senlength = content['senlength']

    res = generate_string(
        model,
        tokenizer,
        args,
        torch.cuda.current_device(),
        title,
        author,
        desc=desc,
        lycr=lycr,
        senlength=senlength)

    return res
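
# Illustrative call (all values hypothetical; `prepare_model` expects a local
# model directory):
#   model, tokenizer, args = prepare_model('/path/to/txl_poem_model')
#   res = fast_poem(
#       {'title': '静夜思', 'author': '李白', 'desc': None,
#        'lycr': 5, 'senlength': 4},
#       model, tokenizer, args)
#   print(res['text'])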

263
modelscope/models/nlp/txl_poem/gpt2/configure_data.py
Executable file
@@ -0,0 +1,263 @@
# Modified by Zhipu.AI
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""parses arguments and preps data loader"""

import copy

import torch

from . import data_utils, mpu


class DataConfig:

    def __init__(self, defaults={}):
        super(DataConfig, self).__init__()
        self.defaults = defaults

    def apply(self, args):
        if torch.distributed.get_rank() == 0:
            print('configuring data')
        self.apply_defaults(args)
        return make_loaders(args)

    def set_defaults(self, **kwargs):
        for k, v in kwargs.items():
            self.defaults[k] = v

    def apply_defaults(self, args):
        for k, v in self.defaults.items():
            k = k.replace('-', '_')
            if not hasattr(args, k):
                setattr(args, k, v)


def make_data_loader(dataset, batch_size, args):
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    if args.transformer_xl:
        batch_sampler = data_utils.samplers.DistributedSequentialSampler(
            len(dataset), args.train_iters, batch_size, rank, world_size)
    else:
        shuffle = args.shuffle
        if shuffle:
            sampler = data_utils.samplers.RandomSampler(
                dataset,
                replacement=True,
                num_samples=batch_size * args.train_iters)
        else:
            sampler = torch.utils.data.SequentialSampler(dataset)
        drop_last = distributed
        # the GPUs in the same model parallel group receive the same data
        if distributed:
            batch_sampler = data_utils.samplers.DistributedBatchSampler(
                sampler,
                batch_size,
                drop_last,
                rank,
                world_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps)
        else:
            batch_sampler = torch.utils.data.BatchSampler(
                sampler, batch_size, drop_last)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        num_workers=args.num_workers,
        pin_memory=True)

    return data_loader
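
# Note: the DistributedSequentialSampler branch above keeps each data-parallel
# rank reading its shard in order, which the transformer-xl recurrent memory
# presumably depends on; the shuffled/batched branch is the general path.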


def make_tfrecord_loaders(args):
    """Load train/val/test dataset from shuffled TFRecords"""

    # imported lazily so the TFRecord dependency is only pulled in on this
    # path; `tf_dl` is a submodule of the local data_utils package
    from .data_utils import tf_dl
    data_set_args = {
        'batch_size': args.batch_size,
        'max_seq_len': args.seq_length,
        'max_preds_per_seq': args.max_preds_per_seq,
        'train': True,
        'num_workers': max(args.num_workers, 1),
        'seed': args.seed + args.rank + 1,
        'threaded_dl': args.num_workers > 0
    }
    train = tf_dl.TFRecordDataLoader(args.train_data, **data_set_args)
    data_set_args['train'] = False
    if args.eval_seq_length is not None:
        data_set_args['max_seq_len'] = args.eval_seq_length
    if args.eval_max_preds_per_seq is not None:
        data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    valid = None
    if args.valid_data is not None:
        valid = tf_dl.TFRecordDataLoader(args.valid_data, **data_set_args)
    test = None
    if args.test_data is not None:
        test = tf_dl.TFRecordDataLoader(args.test_data, **data_set_args)
    tokenizer = data_utils.make_tokenizer(
        args.tokenizer_type,
        train,
        args.tokenizer_path,
        args.vocab_size,
        args.tokenizer_model_type,
        cache_dir=args.cache_dir)

    return (train, valid, test), tokenizer


def make_loaders(args):
    """makes training/val/test"""

    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'local_rank': args.local_rank,
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'lazy': args.lazy_loader,
        'xl_style': args.transformer_xl,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'non_binary_cols': None,
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'tokenizer_type': args.tokenizer_type,
        'tokenizer_model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'pre_tokenize': not args.not_pre_tokenize
    }

    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key

    # make datasets splits and tokenizer
    train = None
    valid = None
    test = None

    if args.train_data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer

    # make validation and test datasets if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid, tokenizer = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test, tokenizer = data_utils.make_dataset(**eval_set_args)

    # wrap datasets with data loader
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, batch_size, args)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, eval_batch_size, args)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, eval_batch_size, args)
        args.do_test = True
    else:
        args.do_test = False

    return (train, valid, test), tokenizer


def get_split(args):
    """
    Get dataset splits from comma separated string list
    """
    splits = []
    if args.split.find(',') != -1:
        splits = [float(s) for s in args.split.split(',')]
    elif args.split.find('/') != -1:
        splits = [float(s) for s in args.split.split('/')]
    else:
        splits = [float(args.split)]
    split_total = sum(splits)
    if split_total < 1.:
        splits.append(1 - split_total)
    while len(splits) < 3:
        splits.append(0.)
    splits = splits[:3]
    if args.valid_data is not None:
        splits[1] = 0.
    if args.test_data is not None:
        splits[2] = 0.
    final_sum = sum(splits)
    return [s / final_sum for s in splits]
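
# Worked examples: '--split 9,0.5,0.5' parses to [9.0, 0.5, 0.5] and is
# normalised to [0.9, 0.05, 0.05]; '--split 0.8' becomes [0.8, 0.2] after the
# remainder is appended, then [0.8, 0.2, 0.0] once padded to three entries.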


def configure_data():
    """add cmdline flags for configuring datasets"""
    # These are options that are used by data_utils, but are either
    # deprecated or not meant to be exposed to the command line user.
    # These options are intended to be set in code by specific scripts.
    defaults = {
        'world_size': 1,
        'rank': -1,
        'persist_state': 0,
        'lazy': False,
        'transpose': False,
        'data_set_type': 'supervised',
        'seq_length': 256,
        'eval_seq_length': 256,
        'samples_per_shard': 100
    }

    return DataConfig(defaults=defaults)

250
modelscope/models/nlp/txl_poem/gpt2/data_utils/__init__.py
Executable file
@@ -0,0 +1,250 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for creating datasets"""
import math
import os
import time

from . import corpora
from .datasets import (ConcatDataset, GPT2Dataset, ShuffleDataset,
                       SplitDataset, XLDataset, bert_sentencepair_dataset,
                       csv_dataset, json_dataset, split_ds)
from .lazy_loader import LazyLoader, LazyWriter, exists_lazy
from .samplers import DistributedBatchSampler
from .tokenization import (BertWordPieceTokenizer, CharacterLevelTokenizer,
                           CommandToken, GPT2BPETokenizer, Tokenization,
                           Tokenizer, make_tokenizer)

TRAIN_DATA = 0
VAL_DATA = 1
TEST_DATA = 2


def should_split(split):
    """
    given split proportions checks if should split
    Examples:
    >>> should_split([10,0,0])
    False
    >>> should_split([1,.1,.2])
    True
    """
    return max(split) / sum(split) != 1.


def get_ext(path):
    """gets path extension"""
    return os.path.splitext(path)[1]


def get_dataset(name, tokenizer, pre_tokenize, local_rank):
    """gets dataset object based on keyword args and file at `path`"""
    if supported_corpus(name):
        dataset = corpora.NAMED_CORPORA[name]
        path = dataset.PATH
        if issubclass(dataset, corpora.PromptReader):
            if not (exists_lazy(path, data_type='prompt')
                    and exists_lazy(path, data_type='text')):
                # create cached version of dataset for lazy loading if it doesn't exist
                if local_rank == 0:
                    prompt_writer = LazyWriter(
                        path, data_type='prompt', is_array=pre_tokenize)
                    text_writer = LazyWriter(
                        path, data_type='text', is_array=pre_tokenize)
                    writers = {'prompt': prompt_writer, 'text': text_writer}
                    dataset(
                        writers=writers,
                        tokenizer=tokenizer,
                        tokenize=pre_tokenize)
                    prompt_writer.close()
                    text_writer.close()
                else:
                    while not os.path.exists(
                            LazyWriter.get_len_path(path,
                                                    data_type='prompt')):
                        time.sleep(1)
            map_fn = (lambda x: x.tolist()) if pre_tokenize else None
            prompts = LazyLoader(
                path,
                data_type='prompt',
                map_fn=map_fn,
                mem_map=True,
                is_array=pre_tokenize)
            texts = LazyLoader(
                path,
                data_type='text',
                map_fn=map_fn,
                mem_map=True,
                is_array=pre_tokenize)
            text = corpora.PromptDataset(
                prompt_loader=prompts,
                text_loader=texts,
                tokenizer=tokenizer,
                to_tokenize=not pre_tokenize)
            return text
        elif issubclass(dataset, corpora.KeyReader):
            if not (exists_lazy(path, data_type='text')
                    and exists_lazy(path, data_type='mask')):
                # create cached version of dataset for lazy loading if it doesn't exist
                if local_rank == 0:
                    text_writer = LazyWriter(
                        path, data_type='text', is_array=pre_tokenize)
                    mask_writer = LazyWriter(
                        path, data_type='mask', is_array=True)
                    writers = {'mask': mask_writer, 'text': text_writer}
                    dataset(
                        writers=writers,
                        tokenizer=tokenizer,
                        tokenize=pre_tokenize)
                    mask_writer.close()
                    text_writer.close()
                else:
                    while not os.path.exists(
                            LazyWriter.get_len_path(path, data_type='mask')):
                        time.sleep(1)
            map_fn = (lambda x: x.tolist()) if pre_tokenize else None
            masks = LazyLoader(
                path,
                data_type='mask',
                map_fn=map_fn,
                mem_map=True,
                is_array=True)
            texts = LazyLoader(
                path,
                data_type='text',
                map_fn=map_fn,
                mem_map=True,
                is_array=pre_tokenize)
            text = corpora.KeyDataset(
                mask_loader=masks,
                text_loader=texts,
                tokenizer=tokenizer,
                to_tokenize=not pre_tokenize)
            return text
    else:
        raise NotImplementedError('dataset %s is not supported' % name)


def supported_corpus(corpus_name):
    """checks if corpus name is defined in `corpora.py`"""
    return corpus_name in corpora.NAMED_CORPORA


def make_dataset(path,
                 seq_length,
                 mem_length,
                 local_rank,
                 lazy=False,
                 xl_style=False,
                 shuffle=True,
                 split=None,
                 tokenizer=None,
                 tokenizer_type='CharacterLevelTokenizer',
                 tokenizer_model_path=None,
                 vocab_size=None,
                 model_type='bpe',
                 pad_token=0,
                 character_converage=1.0,
                 non_binary_cols=None,
                 sample_one_document=False,
                 pre_tokenize=False,
                 **kwargs):
    """function to create datasets+tokenizers for common options"""
    if split is None:
        split = [1.]
    if non_binary_cols is not None:
        # multilabel dataset support (only for csvs)
        label_key = non_binary_cols  # noqa

    # make tokenizer for dataset
    if tokenizer is None:
        tokenizer = make_tokenizer(tokenizer_type, None, tokenizer_model_path,
                                   vocab_size, model_type, pad_token,
                                   character_converage, **kwargs)

    # get one or multiple datasets and concatenate
    if isinstance(path, str):
        ds = get_dataset(
            path,
            tokenizer=tokenizer,
            pre_tokenize=pre_tokenize,
            local_rank=local_rank)
    else:
        ds = [
            get_dataset(
                p,
                tokenizer=tokenizer,
                pre_tokenize=pre_tokenize,
                local_rank=local_rank) for p in path
        ]
        ds = ConcatDataset(ds)

    ds_type = ''
    if 'ds_type' in kwargs:
        ds_type = kwargs['ds_type']
    # Split dataset into train/val/test (and wrap bert dataset)
    if should_split(split):
        ds = split_ds(ds, split, shuffle=shuffle)
        if ds_type.lower() == 'bert':
            presplit_sentences = kwargs[
                'presplit_sentences'] if 'presplit_sentences' in kwargs else False
            ds = [
                bert_sentencepair_dataset(
                    d,
                    max_seq_len=seq_length,
                    presplit_sentences=presplit_sentences)
                if d is not None else None for d in ds
            ]
        elif ds_type.lower() == 'gpt2':
            if xl_style:
                ds = [
                    XLDataset(
                        d,
                        tokenizer,
                        max_seq_len=seq_length,
                        mem_len=mem_length,
                        sample_across_doc=not sample_one_document)
                    if d is not None else None for d in ds
                ]
            else:
                ds = [
                    GPT2Dataset(
                        d,
                        tokenizer,
                        max_seq_len=seq_length,
                        sample_across_doc=not sample_one_document)
                    if d is not None else None for d in ds
                ]
    else:
        if ds_type.lower() == 'bert':
            presplit_sentences = kwargs[
                'presplit_sentences'] if 'presplit_sentences' in kwargs else False
            ds = bert_sentencepair_dataset(
                ds,
                max_seq_len=seq_length,
                presplit_sentences=presplit_sentences)
        elif ds_type.lower() == 'gpt2':
            if xl_style:
                ds = XLDataset(
                    ds,
                    tokenizer,
                    max_seq_len=seq_length,
                    mem_len=mem_length,
                    sample_across_doc=not sample_one_document)
            else:
                ds = GPT2Dataset(
                    ds,
                    tokenizer,
                    max_seq_len=seq_length,
                    sample_across_doc=not sample_one_document)
    return ds, tokenizer

366
modelscope/models/nlp/txl_poem/gpt2/data_utils/corpora.py
Executable file
@@ -0,0 +1,366 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""several datasets with preset arguments"""
import os
import random
from multiprocessing import Process, Queue

import json
import tqdm
from torch.utils import data

from .datasets import csv_dataset, json_dataset
from .lazy_loader import LazyLoader

NUM_PROCESSES = 40


class webtext(json_dataset):
    """
    dataset for webtext with arguments configured for convenience

    command line usage: `--train-data webtext`
    """
    PATH = 'data/webtext/data.json'
    assert_str = 'make sure to set PATH for webtext data_utils/corpora.py'

    def __init__(self, **kwargs):
        assert os.path.exists(webtext.PATH), \
            webtext.assert_str
        if not kwargs:
            kwargs = {}
        kwargs['text_key'] = 'text'
        kwargs['loose_json'] = True
        super(webtext, self).__init__(webtext.PATH, **kwargs)


class KeyDataset(data.Dataset):

    def __init__(self, text_loader, mask_loader, **kwargs):
        self.texts = text_loader
        self.masks = mask_loader
        self.is_lazy = False
        if isinstance(self.texts, LazyLoader) and isinstance(
                self.masks, LazyLoader):
            self.text_lens = self.texts.lens
            self.is_lazy = True

    def get_text_len(self, idx):
        return self.text_lens[idx]

    def __getitem__(self, index):
        text = self.texts[index]
        mask_length = self.masks[index]
        mask = []
        for i, length in enumerate(mask_length):
            if i % 2 == 0:
                mask += [0] * length
            else:
                mask += [1] * length
        assert len(text) == len(mask)
        return {'tokens': text, 'loss_masks': mask}

    def __len__(self):
        return len(self.texts)
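
# In `KeyDataset.__getitem__` above, the stored lengths alternate masked and
# unmasked segments: mask_length = [3, 5, 2] expands to
# loss_masks = [0, 0, 0, 1, 1, 1, 1, 1, 0, 0], so even-indexed segments are
# excluded from the loss and odd-indexed segments are kept.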


class PromptDataset(data.Dataset):

    def __init__(self,
                 prompt_loader,
                 text_loader,
                 tokenizer=None,
                 to_tokenize=False,
                 **kwargs):
        self.prompts = prompt_loader
        self.texts = text_loader
        self.tokenizer = tokenizer
        self.to_tokenize = to_tokenize
        # default mirrors KeyDataset so non-lazy loaders still expose is_lazy
        self.is_lazy = False
        if isinstance(self.prompts, LazyLoader) and isinstance(
                self.texts, LazyLoader):
            self.prompt_lens = self.prompts.lens
            self.text_lens = self.texts.lens
            self.is_lazy = True

    def get_text_len(self, idx):
        return self.prompt_lens[idx] + self.text_lens[idx]

    def __getitem__(self, index):
        prompt = self.prompts[index]
        text = self.texts[index]
        if self.to_tokenize:
            prompt = self.tokenizer.EncodeAsIds(prompt).tokenization
            text = self.tokenizer.EncodeAsIds(text).tokenization
        return {
            'tokens': prompt + text,
            'loss_masks': [0] * len(prompt) + [1] * len(text)
        }

    def __len__(self):
        return len(self.prompts)


class DataReader:
    PATH = None
    assert_str = None

    @staticmethod
    def tokenize_worker(input, output, reader, tokenizer, tokenize):
        raise NotImplementedError

    def __init__(self, writers, tokenizer=None, tokenize=False, **kwargs):
        assert os.path.exists(self.PATH), self.assert_str
        self.tokenizer = tokenizer
        self.tokenize = tokenize
        self.writers = writers
        if os.path.isdir(self.PATH):
            paths = [
                entry.path for entry in os.scandir(self.PATH)
                if not entry.is_dir() and not entry.name.endswith('bz2')
            ]
        else:
            paths = [self.PATH]
        task_queue, done_queue = Queue(), Queue()
        processes = []
        for i in range(NUM_PROCESSES):
            process = Process(
                target=self.tokenize_worker,
                args=(task_queue, done_queue, type(self), tokenizer,
                      tokenize))
            process.start()
            processes.append(process)
        for path in paths:
            with open(path) as file:
                for row in tqdm.tqdm(file):
                    task_queue.put(row)
        for i in range(len(processes)):
            task_queue.put('STOP')
        count = len(processes)
        progress_bar = tqdm.tqdm()
        while True:
            data = done_queue.get()
            if data == 'COMPLETE':
                count -= 1
                if count == 0:
                    break
            else:
                self.write_result(data, self.writers)
                progress_bar.update()
        progress_bar.close()

    @staticmethod
    def write_result(data, writers):
        raise NotImplementedError

    @staticmethod
    def get_token_count(contents):
        return sum(map(len, contents))

    @staticmethod
    def process_sample(text, tokenizer, tokenize):
        if isinstance(text, str) and tokenize:
            text = tokenizer.EncodeAsIds(text).tokenization if text else []
        return text

    @staticmethod
    def trim_field(content, max_length):
        if len(content) > max_length:
            content = content[:max_length]
            content += '......'
        return content

    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        raise NotImplementedError


class PromptReader(DataReader):

    @staticmethod
    def tokenize_worker(input, output, reader, tokenizer, tokenize):
        for row in iter(input.get, 'STOP'):
            data = json.loads(row)
            prompts, texts = reader.process_line(data, tokenizer, tokenize)
            for prompt, text in zip(prompts, texts):
                output.put((prompt, text))
        output.put('COMPLETE')

    @staticmethod
    def write_result(data, writers):
        prompt, text = data
        writers['prompt'].write(prompt)
        writers['text'].write(text)


class KeyReader(DataReader):
    PATH = '/root/data/wikipedia/wiki-key.txt'
    assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py'

    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        keys, contents = data['key'], data['content']
        assert len(keys) == len(contents)
        for i in range(1, len(keys)):
            keys[i] = ' ' + keys[i]
        contents = [' ' + content for content in contents]
        keys = [tokenizer.EncodeAsIds(key).tokenization for key in keys]
        contents = [
            tokenizer.EncodeAsIds(content).tokenization
            for content in contents
        ]
        summary = sum(keys, [])
        summary_prefix = cls.process_sample('Summary: ', tokenizer, tokenize)
        summary_mask = [len(summary_prefix), len(summary)]
        summary = summary_prefix + summary
        text, text_mask = [], []
        for key, content in zip(keys, contents):
            text += key
            text += content
            text_mask.append(len(key))
            text_mask.append(len(content))
        return (summary, summary_mask), (text, text_mask)

    @staticmethod
    def tokenize_worker(input, output, reader, tokenizer, tokenize):
        for row in iter(input.get, 'STOP'):
            data = json.loads(row)
            summary, content = reader.process_line(data, tokenizer, tokenize)
            output.put((summary, content))
        output.put('COMPLETE')

    @staticmethod
    def write_result(data, writers):
        summary, content = data
        writers['text'].write(summary[0])
        writers['mask'].write(summary[1])
        writers['text'].write(content[0])
        writers['mask'].write(content[1])


class zhihu(PromptReader):
    PATH = '/root/data/zhihu/zhihu'
    # PATH = "data/zhihu/data.json"
    assert_str = 'make sure to set PATH for zhihu data_utils/corpora.py'
    qtitle_prefix = '问题:'
    qcontent_prefix = '问题描述:'
    user_prefix = '回答用户:'
    answer_prefix = ' 回答:'

    # qtitle_prefix = []
    # qcontent_prefix = []
    # user_prefix = []
    # answer_prefix = []

    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        prompts, texts = [], []
        ans_length = len(data.get('ans-content', ''))
        ans_up = data.get('ans-up-num', '')
        ans_up = int(ans_up) if ans_up else 0
        if ans_length > 100 or ans_up > 1000:
            qtitle = data['q_title']
            qcontent = data['q-content']
            if qcontent is None:
                qcontent = ''
            qcontent = cls.trim_field(qcontent, max_length=100)
            user = data.get('user-signature', '')
            prompt = cls.qtitle_prefix + qtitle + cls.qcontent_prefix + qcontent + cls.user_prefix + user + cls.answer_prefix  # noqa
            text = data['ans-content']
            prompt, text = cls.process_sample(prompt, tokenizer,
                                              tokenize), cls.process_sample(
                                                  text, tokenizer, tokenize)
            prompts.append(prompt)
            texts.append(text)
        # prompt = data["q_title"] + data["q-content"] + data["user-signature"]
        # text = data["ans-content"]
        # prompts.append(prompt)
        # texts.append(text)
        return prompts, texts


class zhidao(PromptReader):
    PATH = '/root/data/zhidao/zhidao'
    assert_str = 'make sure to set PATH for zhidao data_utils/corpora.py'
    qtitle_prefix = '问题:'
    qcontent_prefix = '问题描述:'
    answer_prefix = '回答:'

    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        if 'title' not in data:
            return [], []
        prompts, texts = [], []
        qtitle = data['title']
        qcontent = data.get('content', '')
        qcontent = cls.trim_field(qcontent, max_length=100)
        prompt = cls.qtitle_prefix + qtitle + cls.qcontent_prefix + qcontent + cls.answer_prefix
        prompt = cls.process_sample(prompt, tokenizer, tokenize)
        if 'best_answer' in data:
            text = data['best_answer']['content']
            if len(text) > 10:
                text = cls.process_sample(text, tokenizer, tokenize)
                prompts.append(prompt)
                texts.append(text)
        for answer in data.get('other_answers', []):
            text = answer['content']
            if len(text) > 100:
                text = cls.process_sample(text, tokenizer, tokenize)
                prompts.append(prompt)
                texts.append(text)
        return prompts, texts


class baike(PromptReader):
    PATH = '/root/data/baike/baike'
    assert_str = 'make sure to set PATH for baike data_utils/corpora.py'

    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        prompts, texts = [], []
        text = data.get('title', '') + data.get('abstract', '') + data.get(
            'content', '')
        if text:
            p, t = cls.process_sample('', tokenizer,
                                      tokenize), cls.process_sample(
                                          text, tokenizer, tokenize)
            prompts.append(p)
            texts.append(t)
        return prompts, texts


class wikipedia(PromptReader):
    """
    dataset for wikipedia with arguments configured for convenience

    command line usage: `--train-data wikipedia`
    """
    # PATH = '/dataset/data/wiki.txt'
    PATH = '/root/data/wikipedia/wiki.txt'
    assert_str = 'make sure to set PATH for wikipedia data_utils/corpora.py'

    @classmethod
    def process_line(cls, data, tokenizer, tokenize):
        text = data['text']
        prompt, text = cls.process_sample('', tokenizer,
                                          tokenize), cls.process_sample(
                                              text, tokenizer, tokenize)
        return [prompt], [text]


NAMED_CORPORA = {
    'wikipedia': wikipedia,
    'wikipedia-key': KeyReader,
    'webtext': webtext,
    'zhihu': zhihu,
    'zhidao': zhidao,
    'baike': baike
}

1060
modelscope/models/nlp/txl_poem/gpt2/data_utils/datasets.py
Executable file
File diff suppressed because it is too large
70
modelscope/models/nlp/txl_poem/gpt2/data_utils/extraction.py
Executable file
@@ -0,0 +1,70 @@
# Copyright (c) 2022 Zhipu.AI
import glob
import os

import json
import nltk

nltk.download('punkt')


class NLTKSegmenter:

    def __init__(self):
        pass

    @staticmethod
    def segment_string(article):
        return nltk.tokenize.sent_tokenize(article)


wiki_path = 'data/extracted'
output_path = 'formatted/wiki-key.txt'
segmenter = NLTKSegmenter()
with open(output_path, 'w') as output:
    for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False):
        for filename in glob.glob(
                os.path.join(dirname, 'wiki_*'), recursive=True):
            print(filename)
            article_lines = []
            article_open = False
            with open(filename, mode='r', newline='\n') as file:
                for line in file:
                    line = line.rstrip()
                    if '<doc id=' in line:
                        article_open = True
                    elif '</doc>' in line:
                        key_sentences, contents = [], []
                        key, content = None, []
                        for sentences in article_lines[1:]:
                            if len(sentences) > 1:
                                if key:
                                    if len(content) > 0 or len(contents) == 0:
                                        key_sentences.append(key)
                                        contents.append(content)
                                    else:
                                        contents[-1].append(key)
                                    key, content = None, []
                                key_sentences.append(sentences[0])
                                contents.append(sentences[1:])
                            elif len(sentences) > 0:
                                if key:
                                    content.append(sentences[0])
                                else:
                                    key = sentences[0]
                        if key:
                            if len(content) > 0 or len(contents) == 0:
                                key_sentences.append(key)
                                contents.append(content)
                            else:
                                contents[-1].append(key)
                        contents = [' '.join(content) for content in contents]
                        article = {'key': key_sentences, 'content': contents}
                        output.write(json.dumps(article))
                        output.write('\n')
                        article_open = False
                        article_lines = []
                    else:
                        if article_open and line:
                            sentences = segmenter.segment_string(line)
                            article_lines.append(sentences)

259
modelscope/models/nlp/txl_poem/gpt2/data_utils/file_utils.py
Executable file
@@ -0,0 +1,259 @@
# This file is provided as is from:
# https://github.com/huggingface/pytorch-pretrained-BERT
# Please refer to their repository for copyright.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import logging
import os
import shutil
import sys
import tempfile
from functools import wraps
from hashlib import sha256
from io import open

import boto3
import json
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

try:
    from pathlib import Path
    PYTORCH_PRETRAINED_BERT_CACHE = Path(
        os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                  Path.home() / '.pytorch_pretrained_bert'))
except (AttributeError, ImportError):
    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
        'PYTORCH_PRETRAINED_BERT_CACHE',
        os.path.join(os.path.expanduser('~'), '.pytorch_pretrained_bert'))

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
    filename = url_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode('utf-8')
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()

    return filename


def filename_to_url(filename, cache_dir=None):
    """
    Return the url and etag (which may be ``None``) stored for `filename`.
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    cache_path = os.path.join(cache_dir, filename)
    if not os.path.exists(cache_path):
        raise EnvironmentError('file {} not found'.format(cache_path))

    meta_path = cache_path + '.json'
    if not os.path.exists(meta_path):
        raise EnvironmentError('file {} not found'.format(meta_path))

    with open(meta_path, encoding='utf-8') as meta_file:
        metadata = json.load(meta_file)
    url = metadata['url']
    etag = metadata['etag']

    return url, etag


def cached_path(url_or_filename, cache_dir=None):
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    parsed = urlparse(url_or_filename)

    if parsed.scheme in ('http', 'https', 's3'):
        # URL, so get it from the cache (downloading if necessary)
        return get_from_cache(url_or_filename, cache_dir)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
    elif parsed.scheme == '':
        # File, but it doesn't exist.
        raise EnvironmentError('file {} not found'.format(url_or_filename))
    else:
        # Something unknown
        raise ValueError(
            'unable to parse {} as a URL or as a local path'.format(
                url_or_filename))


def split_s3_path(url):
    """Split a full s3 path into the bucket name and path."""
    parsed = urlparse(url)
    if not parsed.netloc or not parsed.path:
        raise ValueError('bad s3 path {}'.format(url))
    bucket_name = parsed.netloc
    s3_path = parsed.path
    # Remove '/' at beginning of path.
    if s3_path.startswith('/'):
        s3_path = s3_path[1:]
    return bucket_name, s3_path
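
# e.g. split_s3_path('s3://my-bucket/models/weights.bin') returns
# ('my-bucket', 'models/weights.bin') (illustrative bucket and key names).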


def s3_request(func):
    """
    Wrapper function for s3 requests in order to create more helpful error
    messages.
    """

    @wraps(func)
    def wrapper(url, *args, **kwargs):
        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            if int(exc.response['Error']['Code']) == 404:
                raise EnvironmentError('file {} not found'.format(url))
            else:
                raise

    return wrapper


@s3_request
def s3_etag(url):
    """Check ETag on S3 object."""
    s3_resource = boto3.resource('s3')
    bucket_name, s3_path = split_s3_path(url)
    s3_object = s3_resource.Object(bucket_name, s3_path)
    return s3_object.e_tag


@s3_request
def s3_get(url, temp_file):
    """Pull a file directly from S3."""
    s3_resource = boto3.resource('s3')
    bucket_name, s3_path = split_s3_path(url)
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


def http_get(url, temp_file):
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit='B', total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()


def get_from_cache(url, cache_dir=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Get eTag to add to filename, if it exists.
    if url.startswith('s3://'):
        etag = s3_etag(url)
    else:
        response = requests.head(url, allow_redirects=True)
        if response.status_code != 200:
            raise IOError(
                'HEAD request failed for url {} with status code {}'.format(
                    url, response.status_code))
        etag = response.headers.get('ETag')

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info('%s not found in cache, downloading to %s', url,
                        temp_file.name)

            # GET file object
            if url.startswith('s3://'):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info('copying %s to cache at %s', temp_file.name,
                        cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info('creating metadata file for %s', cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w', encoding='utf-8') as meta_file:
                json.dump(meta, meta_file)

            logger.info('removing temp file %s', temp_file.name)

    return cache_path


def read_set_from_file(filename):
    '''
    Extract a de-duped collection (set) of text from a file.
    Expected file format is one item per line.
    '''
    collection = set()
    with open(filename, 'r', encoding='utf-8') as file_:
        for line in file_:
            collection.add(line.rstrip())
    return collection


def get_file_extension(path, dot=True, lower=True):
    ext = os.path.splitext(path)[1]
    ext = ext if dot else ext[1:]
    return ext.lower() if lower else ext

253
modelscope/models/nlp/txl_poem/gpt2/data_utils/lazy_loader.py
Executable file
@@ -0,0 +1,253 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for loading text from disk"""
import mmap
import os
import pickle as pkl
import time
from itertools import accumulate

import numpy as np
import torch
from torch.multiprocessing import Lock


def get_lazy_path(path):
    """
    Gets directory path where lazy files are stored.
    """
    return os.path.splitext(path)[0] + '.lazy'
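
# e.g. get_lazy_path('data/webtext/data.json') -> 'data/webtext/data.lazy'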


def exists_lazy(path, data_type='data'):
    """
    Check if we've already made a lazy version of this file for the `data_type` field.
    """
    if not os.path.exists(get_lazy_path(path)):
        return False
    contents = os.listdir(get_lazy_path(path))
    if data_type not in contents:
        return False
    if data_type + '.len.pkl' not in contents:
        return False
    return True
|
||||
|
||||
|
||||
class LazyWriter:
|
||||
|
||||
def __init__(self,
|
||||
path,
|
||||
data_type,
|
||||
is_array=False,
|
||||
array_data_type=np.int32):
|
||||
lazypath = get_lazy_path(path)
|
||||
if not os.path.exists(lazypath):
|
||||
os.makedirs(lazypath)
|
||||
self.datapath = os.path.join(lazypath, data_type)
|
||||
self.lenpath = os.path.join(lazypath, data_type + '.len.pkl')
|
||||
self.array_data_type = array_data_type
|
||||
self.output = open(self.datapath, 'wb')
|
||||
self.lengths = []
|
||||
self.is_array = is_array
|
||||
|
||||
@staticmethod
|
||||
def get_len_path(path, data_type):
|
||||
lazypath = get_lazy_path(path)
|
||||
return os.path.join(lazypath, data_type + '.len.pkl')
|
||||
|
||||
def write(self, s):
|
||||
if isinstance(s, dict):
|
||||
s = s['text']
|
||||
if self.is_array:
|
||||
encoded = np.array(
|
||||
s, dtype=self.array_data_type).tobytes(order='C')
|
||||
self.output.write(encoded)
|
||||
self.lengths.append(len(s))
|
||||
else:
|
||||
encoded = s.encode('utf-8')
|
||||
self.output.write(encoded)
|
||||
self.lengths.append(len(encoded))
|
||||
|
||||
def close(self):
|
||||
self.output.close()
|
||||
with open(self.lenpath, 'wb') as f:
|
||||
pkl.dump(self.lengths, f)
|
||||
|
||||
|
||||
def split_strings(strings, start, chr_lens):
|
||||
"""
|
||||
Split strings based on string lengths and given start.
|
||||
"""
|
||||
return [
|
||||
strings[i - start:j - start]
|
||||
for i, j in zip([start] + chr_lens[:-1], chr_lens)
|
||||
]
|
||||
|
||||
|
||||
class ProcessorTokenizer:
|
||||
"""
|
||||
callable class that runs a preprocessing, as well as tokenization step,
|
||||
on input text.
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer, process_fn=None):
|
||||
self.tokenizer = tokenizer
|
||||
self.process_fn = process_fn
|
||||
|
||||
def __call__(self, string):
|
||||
if self.tokenizer is not None:
|
||||
string = self.tokenizer(string, process_fn=self.process_fn)
|
||||
elif self.process_fn is not None:
|
||||
string = self.process_fn(string)
|
||||
return string
|
||||
|
||||
|
||||
class LazyLoader(object):
|
||||
"""
|
||||
Arguments:
|
||||
path: path to directory where array entries are concatenated into one big string file
|
||||
and the .len file are located
|
||||
data_type (str): Some datsets have multiple fields that are stored in different paths.
|
||||
`data_type` specifies which of these fields to load in this class
|
||||
mem_map (boolean): Specifies whether to memory map file `path`
|
||||
map_fn (callable): Fetched strings are passed through map_fn before being returned.
|
||||
|
||||
Example of lazy loader directory structure:
|
||||
file.json
|
||||
file.lazy/
|
||||
data_type1
|
||||
data_type1.len.pkl
|
||||
data_type2
|
||||
data_type2.len.pkl
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
path,
|
||||
data_type='data',
|
||||
mem_map=False,
|
||||
map_fn=None,
|
||||
is_array=False,
|
||||
array_data_type=np.int32):
|
||||
lazypath = get_lazy_path(path)
|
||||
datapath = os.path.join(lazypath, data_type)
|
||||
# get file where array entries are concatenated into one big string
|
||||
self._file = open(datapath, 'rb')
|
||||
self.file = self._file
|
||||
self.is_array = is_array
|
||||
self.array_data_type = array_data_type
|
||||
# memory map file if necessary
|
||||
lenpath = os.path.join(lazypath, data_type + '.len.pkl')
|
||||
self.lens = pkl.load(open(lenpath, 'rb'))
|
||||
self.ends = list(accumulate(self.lens))
|
||||
self.dumb_ends = list(self.ends)
|
||||
self.mem_map = mem_map
|
||||
if self.mem_map:
|
||||
if is_array:
|
||||
if self.ends[-1] == 0:
|
||||
self.file = np.array([], dtype=array_data_type)
|
||||
else:
|
||||
self.file = np.memmap(
|
||||
self.file, dtype=array_data_type, mode='r', order='C')
|
||||
else:
|
||||
if self.ends[-1] == 0:
|
||||
self.file = bytearray()
|
||||
else:
|
||||
self.file = mmap.mmap(
|
||||
self.file.fileno(), 0, prot=mmap.PROT_READ)
|
||||
self.read_lock = Lock()
|
||||
self.process_fn = map_fn
|
||||
self.map_fn = map_fn
|
||||
self._tokenizer = None
|
||||
self.is_lazy = True
|
||||
|
||||
def SetTokenizer(self, tokenizer):
|
||||
"""
|
||||
logic to set and remove (set to None) tokenizer.
|
||||
combines preprocessing/tokenization into one callable.
|
||||
"""
|
||||
if tokenizer is None:
|
||||
if not hasattr(self, '_tokenizer'):
|
||||
self._tokenizer = tokenizer
|
||||
else:
|
||||
self._tokenizer = tokenizer
|
||||
self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)
|
||||
|
||||
def GetTokenizer(self):
|
||||
return self._tokenizer
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""
|
||||
read file and splice strings based on string ending array `self.ends`
|
||||
"""
|
||||
if not isinstance(index, slice):
|
||||
if index == 0:
|
||||
start = 0
|
||||
else:
|
||||
start = self.ends[index - 1]
|
||||
end = self.ends[index]
|
||||
rtn = self.file_read(start, end)
|
||||
if self.map_fn is not None:
|
||||
return self.map_fn(rtn)
|
||||
else:
|
||||
# if slice, fetch strings with 1 diskread and then splice in memory
|
||||
chr_lens = self.ends[index]
|
||||
if index.start == 0 or index.start is None:
|
||||
start = 0
|
||||
else:
|
||||
start = self.ends[index.start - 1]
|
||||
stop = chr_lens[-1]
|
||||
strings = self.file_read(start, stop)
|
||||
rtn = split_strings(strings, start, chr_lens)
|
||||
if self.map_fn is not None:
|
||||
return self.map_fn([s for s in rtn])
|
||||
return rtn
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ends)
|
||||
|
||||
def file_read(self, start=0, end=None):
|
||||
"""read specified portion of file"""
|
||||
data_type_size = np.dtype(self.array_data_type).itemsize
|
||||
# atomic reads to avoid race conditions with multiprocess dataloader
|
||||
self.read_lock.acquire()
|
||||
if not self.mem_map:
|
||||
# seek to start of file read
|
||||
if self.is_array:
|
||||
start = start * data_type_size
|
||||
end = end * data_type_size if end is not None else None
|
||||
self.file.seek(start)
|
||||
# read to end of file if no end point provided
|
||||
if end is None:
|
||||
rtn = self.file.read()
|
||||
# else read amount needed to reach end point
|
||||
else:
|
||||
rtn = self.file.read(end - start)
|
||||
if self.is_array:
|
||||
rtn = np.ndarray(
|
||||
shape=(len(rtn) / data_type_size, ),
|
||||
dtype=self.array_data_type,
|
||||
buffer=rtn,
|
||||
order='C')
|
||||
else:
|
||||
rtn = rtn.decode('utf-8', 'ignore')
|
||||
else:
|
||||
rtn = self.file[start:end]
|
||||
if self.is_array:
|
||||
rtn = rtn.copy()
|
||||
else:
|
||||
rtn = rtn.decode('utf-8', 'strict')
|
||||
self.read_lock.release()
|
||||
# TODO: @raulp figure out mem map byte string bug
|
||||
# if mem map'd need to decode byte string to string
|
||||
return rtn
|
||||
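Usage sketch (illustrative, not part of the commit): a minimal round-trip through LazyWriter and LazyLoader; 'demo.json' is a hypothetical path (the writer creates a demo.lazy/ directory beside it).

    from modelscope.models.nlp.txl_poem.gpt2.data_utils.lazy_loader import (
        LazyLoader, LazyWriter)

    writer = LazyWriter('demo.json', data_type='text')
    writer.write({'text': 'first document'})   # dict input: the 'text' field is taken
    writer.write('second document')            # plain string input
    writer.close()

    loader = LazyLoader('demo.json', data_type='text')
    assert len(loader) == 2
    assert loader[1] == 'second document'      # one seek + read per item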
195
modelscope/models/nlp/txl_poem/gpt2/data_utils/samplers.py
Executable file
@@ -0,0 +1,195 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""batch samplers that work with either random or sequential data samplers"""
import math
import os
import sys

import numpy as np
import torch
from torch.utils import data


class RandomSampler(data.sampler.Sampler):
    r"""
    Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler,
    but this class lets the user set an epoch like DistributedSampler.
    Samples elements randomly. If without replacement, then sample from a shuffled dataset.
    If with replacement, then user can specify ``num_samples`` to draw.
    Arguments:
        data_source (Dataset): dataset to sample from
        num_samples (int): number of samples to draw, default=len(dataset)
        replacement (bool): samples are drawn with replacement if ``True``, default=False
    """

    def __init__(self, data_source, replacement=False, num_samples=None):
        self.data_source = data_source
        self.replacement = replacement
        self._num_samples = num_samples
        self.epoch = -1

        if self._num_samples is not None and replacement is False:
            raise ValueError(
                'With replacement=False, num_samples should not be specified, '
                'since a random permute will be performed.')

        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError('num_samples should be a positive integer '
                             'value, but got num_samples={}'.format(
                                 self.num_samples))
        if not isinstance(self.replacement, bool):
            raise ValueError('replacement should be a boolean value, but got '
                             'replacement={}'.format(self.replacement))

    @property
    def num_samples(self):
        # dataset size might change at runtime
        if self._num_samples is None:
            return len(self.data_source)
        return self._num_samples

    def __iter__(self):
        n = len(self.data_source)
        g = torch.Generator()
        if self.epoch >= 0:
            g.manual_seed(self.epoch)
        if self.replacement:
            return iter(
                torch.randint(
                    high=n,
                    size=(self.num_samples, ),
                    dtype=torch.int64,
                    generator=g).tolist())
        return iter(torch.randperm(n, generator=g).tolist())

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch


class DistributedSequentialSampler(data.sampler.Sampler):

    def __init__(self,
                 num_samples,
                 train_iters,
                 batch_size,
                 rank=-1,
                 world_size=2):
        super().__init__(num_samples)
        if rank == -1:
            rank = 0
            world_size = 1
        self.num_samples = num_samples
        self.rank = rank
        self.world_size = world_size
        self.start_iter = 0
        self.train_iters = train_iters
        self.batch_size = batch_size
        self.batch_bias = [
            i * (num_samples // batch_size) for i in range(batch_size)
        ]

    def __iter__(self):
        for idx in range(self.start_iter, self.train_iters * 10):
            batch = [(idx + bias) % self.num_samples
                     for bias in self.batch_bias]
            tbatch = self._batch(batch)
            yield tbatch

    def __len__(self):
        return self.train_iters

    def _batch(self, batch):
        """extracts samples only pertaining to this worker's batch"""
        start = self.rank * self.batch_size // self.world_size
        end = (self.rank + 1) * self.batch_size // self.world_size
        return batch[start:end]


class DistributedBatchSampler(data.sampler.BatchSampler):
    """
    similar to normal implementation of distributed sampler, except implementation is at the
    batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary
    data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
    """

    def __init__(self,
                 sampler,
                 batch_size,
                 drop_last,
                 rank=-1,
                 world_size=2,
                 wrap_last=False,
                 gradient_accumulation_steps=None):
        super(DistributedBatchSampler, self).__init__(sampler, batch_size,
                                                      drop_last)
        if rank == -1:
            assert False, 'should not be here'
        self.rank = rank
        self.world_size = world_size
        self.sampler.wrap_around = 0
        self.wrap_around = 0
        self.wrap_last = wrap_last
        self.start_iter = 0
        self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps  # noqa

    def __iter__(self):
        batch = []
        i = 0
        for idx in self.data_iterator(self.sampler, wrap_around=False):
            batch.append(idx)
            if len(batch) == self.batch_size:
                tbatch = self._batch(batch)
                if i >= self.start_iter * self.effective_batch_size:
                    yield tbatch
                    self.start_iter = 0
                i += len(batch)
                batch = []
        batch_len = len(batch)
        if batch_len > 0 and not self.drop_last:
            if self.wrap_last:
                self.sampler.wrap_around -= (self.batch_size)
                self.wrap_around += (len(batch))
                self.wrap_around %= self.batch_size
                # NOTE: TransposedSampler is referenced here but never defined
                # or imported in this module, so this branch would raise a
                # NameError if reached.
                if isinstance(self.sampler, TransposedSampler):  # noqa: F821
                    for i, idx in enumerate(
                            self.data_iterator(self.sampler,
                                               wrap_around=True)):
                        if i == 0:
                            continue
                        batch.append(idx)
                        new_batch_len = len(batch)  # noqa
                        if len(batch) == self.batch_size:
                            break
            yield self._batch(batch)
        if self.wrap_last:
            self.sampler.wrap_around += self.batch_size

    def data_iterator(self, _iter, wrap_around=False):
        """iterates through data and handles wrap around"""
        for i, idx in enumerate(_iter):
            if i < self.wrap_around % self.batch_size:
                continue
            if wrap_around:
                self.wrap_around += 1
                self.wrap_around %= self.batch_size
            yield idx

    def _batch(self, batch):
        """extracts samples only pertaining to this worker's batch"""
        start = self.rank * self.batch_size // self.world_size
        end = (self.rank + 1) * self.batch_size // self.world_size
        return batch[start:end]
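Usage sketch (illustrative, not part of the commit): wrapping the epoch-aware RandomSampler in DistributedBatchSampler; the rank/world_size values are hypothetical.

    from modelscope.models.nlp.txl_poem.gpt2.data_utils.samplers import (
        DistributedBatchSampler, RandomSampler)

    dataset = list(range(10))
    sampler = RandomSampler(dataset)
    sampler.set_epoch(0)  # same permutation on every rank for this epoch
    batch_sampler = DistributedBatchSampler(
        sampler, batch_size=4, drop_last=True, rank=0, world_size=2)
    for batch in batch_sampler:
        print(batch)  # this rank keeps indices [0:2] of each 4-element batch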
140
modelscope/models/nlp/txl_poem/gpt2/data_utils/sp_tokenizer.py
Executable file
@@ -0,0 +1,140 @@
# Modified by Zhipu.AI
import os

import json
import sentencepiece as spm


def get_pairs(word):
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class Encoder:

    def __init__(self, encoder, bpe_merges):
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.max_len = 0

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)
        if not pairs:
            return token

        while True:
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:  # `first` does not occur again
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        return [self.encoder.get(token, 1) for token in self.tokenize(text)]

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        return text

    def tokenize(self, text):
        bpe_tokens = []
        bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' '))
        return bpe_tokens

    def convert_tokens_to_ids(self, tokens):
        return [self.encoder.get(token, 1) for token in tokens]


class Encoder_SP:

    def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

    def encode(self, text):
        """
        text="...."
        """
        return self.sp.EncodeAsIds(text)

    def decode(self, tokens):
        """
        tokens=[x1,x2,...]
        """
        text = [int(token) for token in tokens]
        # print(text)
        return self.sp.DecodeIds(text)

    def tokenize(self, text):
        return self.sp.EncodeAsPieces(text)

    def convert_tokens_to_ids(self, tokens):
        return [self.sp.PieceToId(token) for token in tokens]

    def convert_token_to_id(self, token):
        return self.sp.PieceToId(token)

    def convert_id_to_token(self, idx):
        return self.sp.IdToPiece(idx)


def get_encoder(encoder_file, bpe_file):
    filepath, filename = os.path.split(encoder_file)
    shortname, extension = os.path.splitext(filename)

    if ('.model' == extension) and (bpe_file == ''):
        return Encoder_SP(encoder_file)
    else:
        with open(encoder_file, 'r', encoding='utf-8') as f:
            encoder = json.load(f)
        with open(bpe_file, 'r', encoding='utf-8') as f:
            bpe_data = f.read()
        bpe_merges = [
            tuple(merge_str.split())
            for merge_str in bpe_data.split('\n')[1:-1]
        ]
        return Encoder(
            encoder=encoder,
            bpe_merges=bpe_merges,
        )


def from_pretrained(model_path):
    return get_encoder(
        model_path + '/chinese_sentencepiece/cog-pretrain.model', '')
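Usage sketch (illustrative, not part of the commit): get_encoder dispatches on the file extension, so a '.model' path with an empty bpe_file selects the SentencePiece encoder; the model path is hypothetical and must point to a trained SentencePiece model.

    from modelscope.models.nlp.txl_poem.gpt2.data_utils.sp_tokenizer import get_encoder

    enc = get_encoder('/path/to/cog-pretrain.model', '')  # returns Encoder_SP
    ids = enc.encode('今天天气不错')
    assert enc.decode(ids) == '今天天气不错'
    pieces = enc.tokenize('今天天气不错')  # subword pieces rather than ids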
160
modelscope/models/nlp/txl_poem/gpt2/data_utils/tf_dl.py
Executable file
@@ -0,0 +1,160 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DataLoader for TFRecords"""

import queue
import threading

import numpy as np
import tensorflow as tf
import torch

tf.enable_eager_execution()


class TFRecordDataLoader(object):

    def __init__(self,
                 records,
                 batch_size,
                 max_seq_len,
                 max_preds_per_seq,
                 train,
                 num_workers=2,
                 seed=1,
                 threaded_dl=False):
        assert max_preds_per_seq is not None, '--max-preds-per-seq MUST BE SPECIFIED when using tfrecords'
        tf.set_random_seed(seed)
        if isinstance(records, str):
            records = [records]

        self.record_converter = Record2Example({
            'input_ids':
            tf.FixedLenFeature([max_seq_len], tf.int64),
            'input_mask':
            tf.FixedLenFeature([max_seq_len], tf.int64),
            'segment_ids':
            tf.FixedLenFeature([max_seq_len], tf.int64),
            'masked_lm_positions':
            tf.FixedLenFeature([max_preds_per_seq], tf.int64),
            'masked_lm_ids':
            tf.FixedLenFeature([max_preds_per_seq], tf.int64),
            'masked_lm_weights':
            tf.FixedLenFeature([max_preds_per_seq], tf.float32),
            'next_sentence_labels':
            tf.FixedLenFeature([1], tf.int64)
        })

        # Instantiate dataset according to original BERT implementation
        if train:
            self.dataset = tf.data.Dataset.from_tensor_slices(
                tf.constant(records))
            self.dataset = self.dataset.repeat()
            self.dataset = self.dataset.shuffle(buffer_size=len(records))

            # use sloppy tfrecord dataset
            self.dataset = self.dataset.apply(
                tf.contrib.data.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy=train,
                    cycle_length=min(num_workers, len(records))))
            self.dataset = self.dataset.shuffle(buffer_size=100)
        else:
            self.dataset = tf.data.TFRecordDataset(records)
            self.dataset = self.dataset.repeat()

        # Instantiate dataloader (do not drop remainder for eval)
        loader_args = {
            'batch_size': batch_size,
            'num_parallel_batches': num_workers,
            'drop_remainder': train
        }
        self.dataloader = self.dataset.apply(
            tf.contrib.data.map_and_batch(self.record_converter,
                                          **loader_args))
        self.threaded_dl = threaded_dl
        self.num_workers = num_workers

    def __iter__(self):
        if self.threaded_dl:
            data_iter = iter(
                MultiprocessLoader(self.dataloader, self.num_workers))
            for item in data_iter:
                yield item
        else:
            data_iter = iter(self.dataloader)
            for item in data_iter:
                yield convert_tf_example_to_torch_tensors(item)


class Record2Example(object):

    def __init__(self, feature_map):
        self.feature_map = feature_map

    def __call__(self, record):
        """Decodes a BERT TF record to a TF example."""
        example = tf.parse_single_example(record, self.feature_map)
        for k, v in list(example.items()):
            if v.dtype == tf.int64:
                example[k] = tf.to_int32(v)
        return example


def convert_tf_example_to_torch_tensors(example):
    item = {k: (v.numpy()) for k, v in example.items()}
    mask = np.zeros_like(item['input_ids'])
    mask_labels = np.ones_like(item['input_ids']) * -1
    for b, row in enumerate(item['masked_lm_positions'].astype(int)):
        for i, idx in enumerate(row):
            if item['masked_lm_weights'][b, i] != 0:
                mask[b, idx] = 1
                mask_labels[b, idx] = item['masked_lm_ids'][b, i]
    output = {
        'text': item['input_ids'],
        'types': item['segment_ids'],
        'is_random': item['next_sentence_labels'],
        'pad_mask': 1 - item['input_mask'],
        'mask': mask,
        'mask_labels': mask_labels
    }
    return {k: torch.from_numpy(v) for k, v in output.items()}


class MultiprocessLoader(object):

    def __init__(self, dataloader, num_workers=2):
        self.dl = dataloader
        self.queue_size = 2 * num_workers

    def __iter__(self):
        output_queue = queue.Queue(self.queue_size)
        output_thread = threading.Thread(
            target=_multiproc_iter, args=(self.dl, output_queue))
        output_thread.daemon = True
        output_thread.start()

        while output_thread.is_alive():
            yield output_queue.get(block=True)
        else:
            print(
                RuntimeError(
                    'TF record data loader thread exited unexpectedly'))


def _multiproc_iter(dl, output_queue):
    data_iter = iter(dl)
    for item in data_iter:
        tensors = convert_tf_example_to_torch_tensors(item)
        output_queue.put(tensors, block=True)
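Usage sketch (illustrative, not part of the commit): this loader targets TensorFlow 1.x (tf.contrib, enable_eager_execution) and will not run on TF 2.x; the record path and sizes below are hypothetical.

    loader = TFRecordDataLoader(
        records='/path/to/bert_pretrain.tfrecord',
        batch_size=8,
        max_seq_len=128,
        max_preds_per_seq=20,
        train=True)
    for batch in loader:              # dicts of torch tensors
        print(batch['text'].shape)    # torch.Size([8, 128])
        break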
1118
modelscope/models/nlp/txl_poem/gpt2/data_utils/tokenization.py
Executable file
File diff suppressed because it is too large
359
modelscope/models/nlp/txl_poem/gpt2/data_utils/tokenization_gpt2.py
Executable file
@@ -0,0 +1,359 @@
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import logging
import os
import sys
from io import open

import json
import regex as re

from .file_utils import cached_path

try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func


logger = logging.getLogger(__name__)

PRETRAINED_VOCAB_ARCHIVE_MAP = {
    # 'gpt2': "/workspace/.pytorch_pretrained_bert/gpt2-vocab.json",
    'gpt2': '.pytorch_pretrained_bert/gpt2-vocab.json',
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
    # 'gpt2': "/workspace/.pytorch_pretrained_bert/gpt2-merges.txt",
    'gpt2': '.pytorch_pretrained_bert/gpt2-merges.txt',
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'gpt2': 1024,
}
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'


@lru_cache()
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
    and avoid mapping to whitespace/control characters the bpe code barfs on.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr  # noqa: F821
    bs = list(range(ord('!'),
                    ord('~') + 1)) + list(range(
                        ord('¡'),
                        ord('¬') + 1)) + list(range(ord('®'),
                                                    ord('ÿ') + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE
    """

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        cache_dir=None,
                        *inputs,
                        **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
                pretrained_model_name_or_path]
            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[
                pretrained_model_name_or_path]
            special_tokens_file = None
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path,
                                      VOCAB_NAME)
            merges_file = os.path.join(pretrained_model_name_or_path,
                                       MERGES_NAME)
            special_tokens_file = os.path.join(pretrained_model_name_or_path,
                                               SPECIAL_TOKENS_NAME)
            if not os.path.exists(special_tokens_file):
                special_tokens_file = None
            else:
                logger.info('loading special tokens file {}'.format(
                    special_tokens_file))
        # The cache redirection through cached_path is disabled here; vocab
        # and merges files are loaded directly from the resolved local paths.
        # try:
        #     resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        #     resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
        # except EnvironmentError:
        #     logger.error(
        #         "Model name '{}' was not found in model name list ({}). "
        #         "We assumed '{}' was a path or url but couldn't find files {} and {} "
        #         "at this path or url.".format(
        #             pretrained_model_name_or_path,
        #             ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
        #             pretrained_model_name_or_path,
        #             vocab_file, merges_file))
        #     return None
        # if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
        #     logger.info("loading vocabulary file {}".format(vocab_file))
        #     logger.info("loading merges file {}".format(merges_file))
        # else:
        #     logger.info("loading vocabulary file {} from cache at {}".format(
        #         vocab_file, resolved_vocab_file))
        #     logger.info("loading merges file {} from cache at {}".format(
        #         merges_file, resolved_merges_file))
        resolved_vocab_file = vocab_file
        resolved_merges_file = merges_file
        logger.info('loading vocabulary file {}'.format(vocab_file))
        logger.info('loading merges file {}'.format(merges_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        if special_tokens_file and 'special_tokens' not in kwargs:
            special_tokens = open(
                special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
        tokenizer = cls(
            resolved_vocab_file,
            resolved_merges_file,
            special_tokens=special_tokens,
            *inputs,
            **kwargs)
        return tokenizer

    def __init__(self,
                 vocab_file,
                 merges_file,
                 errors='replace',
                 special_tokens=None,
                 max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """ Add a list of additional tokens to the encoder.
            The additional tokens are indexed starting from the last index of the
            current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {
            v: k
            for k, v in self.special_tokens.items()
        }
        logger.info('Special tokens {}'.format(self.special_tokens))

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:  # `first` does not occur again
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2:
                token = ''.join(self.byte_encoder[ord(b)] for b in token)
            else:
                token = ''.join(self.byte_encoder[b]
                                for b in token.encode('utf-8'))
            bpe_tokens.extend(
                bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def convert_tokens_to_ids(self, tokens):
        """ Converts a sequence of tokens into ids using the vocab. """
        ids = []
        if isinstance(tokens, str) or (sys.version_info[0] == 2
                                       and isinstance(tokens, unicode)):  # noqa: F821
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning(
                'Token indices sequence length is longer than the specified maximum '
                ' sequence length for this OpenAI GPT model ({} > {}). Running this'
                ' sequence through the model will result in indexing errors'.
                format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids into BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def encode(self, text):
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors=self.errors)
        return text

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary and merge files to a directory."""
        if not os.path.isdir(vocab_path):
            logger.error('Vocabulary path ({}) should be a directory'.format(
                vocab_path))
            return
        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
        merge_file = os.path.join(vocab_path, MERGES_NAME)
        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)

        with open(vocab_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        index = 0
        with open(merge_file, 'w', encoding='utf-8') as writer:
            writer.write(u'#version: 0.2\n')
            for bpe_tokens, token_index in sorted(
                    self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        'Saving vocabulary to {}: BPE merge indices are not consecutive.'
                        ' Please check that the tokenizer is not corrupted!'.
                        format(merge_file))
                    index = token_index
                writer.write(' '.join(bpe_tokens) + u'\n')
                index += 1

        index = len(self.encoder)
        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
            for token, token_index in sorted(
                    self.special_tokens.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        'Saving special tokens vocabulary to {}: BPE indices are not consecutive.'
                        ' Please check that the tokenizer is not corrupted!'.
                        format(special_tokens_file))
                    index = token_index
                writer.write(token + u'\n')
                index += 1

        return vocab_file, merge_file, special_tokens_file
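Usage sketch (illustrative, not part of the commit): byte-level BPE round-trips text exactly; the directory below is hypothetical and must contain GPT-2's vocab.json and merges.txt.

    from modelscope.models.nlp.txl_poem.gpt2.data_utils.tokenization_gpt2 import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('/path/to/gpt2-files')
    ids = tokenizer.encode('Hello world')
    assert tokenizer.decode(ids) == 'Hello world'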
408
modelscope/models/nlp/txl_poem/gpt2/data_utils/wordpiece.py
Executable file
@@ -0,0 +1,408 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes. Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py"""  # noqa

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import collections
import logging
import os
import unicodedata
from io import open

from .file_utils import cached_path

logger = logging.getLogger(__name__)

PRETRAINED_VOCAB_ARCHIVE_MAP = {
    'bert-base-uncased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt',
    'bert-large-uncased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt',
    'bert-base-cased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt',
    'bert-large-cased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt',
    'bert-base-multilingual-uncased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt',
    'bert-base-multilingual-cased':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt',
    'bert-base-chinese':
    'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt',
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'bert-base-uncased': 512,
    'bert-large-uncased': 512,
    'bert-base-cased': 512,
    'bert-large-cased': 512,
    'bert-base-multilingual-uncased': 512,
    'bert-base-multilingual-cased': 512,
    'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, 'r', encoding='utf-8') as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
        """Constructs a BertTokenizer.

        Args:
            vocab_file: Path to a one-wordpiece-per-line vocabulary file
            do_lower_case: Whether to lower case the input
                Only has an effect when do_wordpiece_only=False
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            max_len: An artificial maximum length to truncate tokenized sequences to;
                Effective maximum length is always the minimum of this
                value (if specified) and the underlying BERT model's
                sequence length.
            never_split: List of tokens which will never be split during tokenization.
                Only has an effect when do_wordpiece_only=False
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                'model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case, never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        if self.do_basic_tokenize:
            split_tokens = []
            for token in self.basic_tokenizer.tokenize(text):
                for sub_token in self.wordpiece_tokenizer.tokenize(token):
                    split_tokens.append(sub_token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            logger.warning(
                'Token indices sequence length is longer than the specified maximum '
                ' sequence length for this BERT model ({} > {}). Running this'
                ' sequence through BERT will result in indexing errors'.format(
                    len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids into wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        cache_dir=None,
                        *inputs,
                        **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
                pretrained_model_name_or_path]
        else:
            vocab_file = pretrained_model_name_or_path
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                'associated to this path or url.'.format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info('loading vocabulary file {}'.format(vocab_file))
        else:
            logger.info('loading vocabulary file {} from cache at {}'.format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self,
                 do_lower_case=True,
                 never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
        """Constructs a BasicTokenizer.

        Args:
            do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = self._clean_text(text)
        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case and token not in self.never_split:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(' '.join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize('NFD', text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == 'Mn':
                continue
            output.append(char)
        return ''.join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        if text in self.never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return [''.join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(' ')
                output.append(char)
                output.append(' ')
            else:
                output.append(char)
        return ''.join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  # noqa
                (cp >= 0x3400 and cp <= 0x4DBF) or  # noqa
                (cp >= 0x20000 and cp <= 0x2A6DF) or  # noqa
                (cp >= 0x2A700 and cp <= 0x2B73F) or  # noqa
                (cp >= 0x2B740 and cp <= 0x2B81F) or  # noqa
                (cp >= 0x2B820 and cp <= 0x2CEAF) or  # noqa
                (cp >= 0xF900 and cp <= 0xFAFF) or  # noqa
                (cp >= 0x2F800 and cp <= 0x2FA1F)):  # noqa
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(' ')
            else:
                output.append(char)
        return ''.join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = ''.join(chars[start:end])
                    if start > 0:
                        substr = '##' + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == ' ' or char == '\t' or char == '\n' or char == '\r':
        return True
    cat = unicodedata.category(char)
    if cat == 'Zs':
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == '\t' or char == '\n' or char == '\r':
        return False
    cat = unicodedata.category(char)
    if cat.startswith('C'):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith('P'):
        return True
    return False
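Usage sketch (illustrative, not part of the commit): greedy longest-match-first WordPiece on a local vocabulary; the vocab path is hypothetical and the exact pieces depend on its contents.

    from modelscope.models.nlp.txl_poem.gpt2.data_utils.wordpiece import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('/path/to/vocab.txt')
    tokens = tokenizer.tokenize('unaffable behavior')
    # e.g. ['un', '##aff', '##able', 'behavior'] when those pieces are in the vocab
    ids = tokenizer.convert_tokens_to_ids(tokens)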
20
modelscope/models/nlp/txl_poem/gpt2/fp16/__init__.py
Executable file
@@ -0,0 +1,20 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fp16 import *  # noqa
from .fp16util import (BN_convert_float, FP16Model, clip_grad_norm,
                       convert_module, convert_network,
                       master_params_to_model_params,
                       model_grads_to_master_grads, network_to_half,
                       prep_param_lists, to_python_float, tofp16)
from .loss_scaler import *  # noqa
657
modelscope/models/nlp/txl_poem/gpt2/fp16/fp16.py
Executable file
@@ -0,0 +1,657 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Stable version of apex FP16 Optimizer"""
import torch
from torch import nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Variable
from torch.nn.parameter import Parameter

from .fp16util import (clip_grad_norm, master_params_to_model_params,
                       model_grads_to_master_grads)
from .loss_scaler import DynamicLossScaler, LossScaler

FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)


def conversion_helper(val, conversion):
    """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure."""
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    rtn = [conversion_helper(v, conversion) for v in val]
    if isinstance(val, tuple):
        rtn = tuple(rtn)
    return rtn


def fp32_to_fp16(val):
    """Convert fp32 `val` to fp16"""

    def half_conversion(val):
        val_typecheck = val
        if isinstance(val_typecheck, (Parameter, Variable)):
            val_typecheck = val.data
        if isinstance(val_typecheck, FLOAT_TYPES):
            val = val.half()
        return val

    return conversion_helper(val, half_conversion)


def fp16_to_fp32(val):
    """Convert fp16 `val` to fp32"""

    def float_conversion(val):
        val_typecheck = val
        if isinstance(val_typecheck, (Parameter, Variable)):
            val_typecheck = val.data
        if isinstance(val_typecheck, HALF_TYPES):
            val = val.float()
        return val

    return conversion_helper(val, float_conversion)
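

# A minimal sketch (not part of the upstream apex code) showing how the
# conversion helpers above recurse through nested tuples/lists, so mixed
# structures round-trip cleanly. The tensor names are illustrative and a
# CUDA device is assumed.
if __name__ == '__main__':
    t = torch.randn(2, 2).cuda()
    nested = (t, [t, t.clone()])
    halved = fp32_to_fp16(nested)  # every tensor becomes torch.cuda.HalfTensor
    restored = fp16_to_fp32(halved)  # and back to torch.cuda.FloatTensor
    assert restored[0].dtype == torch.float32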


class FP16_Module(nn.Module):

    def __init__(self, module):
        super(FP16_Module, self).__init__()
        self.add_module('module', module.half())

    def forward(self, *inputs, **kwargs):
        return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs))

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        return self.module.state_dict(destination, prefix, keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)
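

# A minimal sketch (not part of the upstream apex code): wrapping a module in
# FP16_Module halves its weights while keeping fp32 tensors at the boundary.
# The layer sizes here are illustrative assumptions.
if __name__ == '__main__':
    fp16_model = FP16_Module(torch.nn.Linear(10, 4).cuda())
    x = torch.randn(2, 10).cuda()  # fp32 input
    y = fp16_model(x)  # computed in fp16 internally, returned as fp32
    assert y.dtype == torch.float32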


# TODO: Update overflow check + downscale to use Carl's fused kernel.
class FP16_Optimizer(object):
    """
    :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer,
    and manage static or dynamic loss scaling and master weights in a manner transparent to the user.
    For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance,
    and changing the call to ``backward``.

    Example::

        model = torch.nn.Linear(D_in, D_out).cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        # Name the FP16_Optimizer instance to replace the existing optimizer
        # (recommended but not required):
        optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
        ...
        # loss.backward() becomes:
        optimizer.backward(loss)
        ...

    Example with dynamic loss scaling::

        ...
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        # optional arg to control dynamic loss scaling behavior
        # dynamic_loss_args={'scale_window' : 500})
        # Usually, dynamic_loss_args is not necessary.

    Args:
        init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. # noqa
        static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. # noqa
        dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. # noqa
        dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. # noqa
        verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. # noqa

    ``init_optimizer`` is expected to have been constructed in the ordinary way.
    It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be
    named to replace ``init_optimizer``, for two reasons:
    First, it means that references to the same name
    later in the file will not have to change.
    Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to
    modify ``init_optimizer``. If you do choose a unique name for the new
    :class:`FP16_Optimizer` instance, you should only work with this new instance,
    because the preexisting optimizer might no longer behave as expected.

    ``init_optimizer`` may be any Pytorch optimizer.
    It may contain a mixture of fp16 and fp32 parameters organized into any number of
    ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will
    ingest these ``param_groups`` and remember them.

    Calls to ::

        loss.backward()

    must be replaced with ::

        optimizer.backward(loss)

    because :class:`FP16_Optimizer` requires ownership of the backward pass to implement
    loss scaling and copies to master gradients.

    .. note::
        Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients
        are downscaled before being applied. This means that adjusting the loss scale, or using
        dynamic loss scaling, should not require retuning the learning rate or any other
        hyperparameters.


    **Advanced options**

    **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure.
    See docstring for :attr:`step`.

    **Gradient clipping**: Use :attr:`clip_master_grads`.

    **Multiple losses**: If your model accumulates gradients from multiple losses,
    this can be made more efficient by supplying ``update_master_grads=False``
    to :attr:`backward`. See docstring for :attr:`backward`.

    **Manually adjusting loss scale**: The current loss scale can be retrieved or set via ::

        print(optimizer.loss_scale)
        optimizer.loss_scale = new_loss_scale

    For static loss scaling, manually adjusting the loss scale over time is a reasonable
    thing to do. During later epochs, gradients may become smaller, and a
    higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss
    scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting
    the loss scale is not recommended.

    **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
    Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
    should still work as intended.
    """ # noqa

    def __init__(self,
                 init_optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=False):
        if not torch.cuda.is_available():
            raise SystemError('Cannot use fp16 without CUDA.')

        self.verbose = verbose

        self.optimizer = init_optimizer
        # init_state_dict sets up an alternative way to cast per-param state tensors.
        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
        # init_state_dict = init_optimizer.state_dict()

        self.fp16_groups = []
        self.fp32_from_fp16_groups = []
        self.fp32_from_fp32_groups = []
        for i, param_group in enumerate(self.optimizer.param_groups):
            self.maybe_print(
                'FP16_Optimizer processing param group {}:'.format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            fp32_from_fp16_params_this_group = []
            for i, param in enumerate(param_group['params']):
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        self.maybe_print(
                            'FP16_Optimizer received torch.cuda.HalfTensor with {}'
                            .format(param.size()))
                        fp16_params_this_group.append(param)
                        master_param = param.detach().clone().float()
                        master_param.requires_grad = True
                        # Copy the model parallel flag.
                        master_param.model_parallel = param.model_parallel
                        param_group['params'][i] = master_param
                        fp32_from_fp16_params_this_group.append(master_param)
                        # Reset existing state dict key to the new master param.
                        # We still need to recast per-param state tensors, if any, to FP32.
                        if param in self.optimizer.state:
                            self.optimizer.state[
                                master_param] = self.optimizer.state.pop(param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        self.maybe_print(
                            'FP16_Optimizer received torch.cuda.FloatTensor with {}'
                            .format(param.size()))
                        fp32_params_this_group.append(param)
                        param_group['params'][i] = param
                    else:
                        raise TypeError(
                            'Wrapped parameters must be either '
                            'torch.cuda.FloatTensor or torch.cuda.HalfTensor. '
                            'Received {}'.format(param.type()))

            self.fp16_groups.append(fp16_params_this_group)
            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)

        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
        self.optimizer.load_state_dict(self.optimizer.state_dict())
        # alternative way to cast per-param state tensors:
        # self.optimizer.load_state_dict(init_state_dict)

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            if dynamic_loss_args is not None:
                self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
            else:
                self.loss_scaler = DynamicLossScaler()
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True

        self.clip_grad_norm = clip_grad_norm

    def maybe_print(self, msg):
        if self.verbose:
            print(msg)

    def __getstate__(self):
        raise RuntimeError(
            'FP16_Optimizer should be serialized using state_dict().')

    def __setstate__(self, state):
        raise RuntimeError(
            'FP16_Optimizer should be deserialized using load_state_dict().')

    def zero_grad(self, set_grads_to_None=False):
        """
        Zero fp32 and fp16 parameter grads.
        """
        # In principle, only the .grad attributes of the model params need to be zeroed,
        # because gradients are copied into the FP32 master params. However, we zero
        # all gradients owned by the optimizer, just to be safe:
        for group in self.optimizer.param_groups:
            for p in group['params']:
                if set_grads_to_None:
                    p.grad = None
                else:
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()

        # Zero fp16 gradients owned by the model:
        for fp16_group in self.fp16_groups:
            for param in fp16_group:
                if set_grads_to_None:
                    param.grad = None
                else:
                    if param.grad is not None:
                        param.grad.detach_(
                        )  # as in torch.optim.optimizer.zero_grad()
                        param.grad.zero_()

    def _check_overflow(self):
        params = []
        for group in self.fp16_groups:
            for param in group:
                params.append(param)
        for group in self.fp32_from_fp32_groups:
            for param in group:
                params.append(param)
        self.overflow = self.loss_scaler.has_overflow(params)

    def _update_scale(self, has_overflow=False):
        self.loss_scaler.update_scale(has_overflow)

    def _master_params_to_model_params(self):
        for fp16_group, fp32_from_fp16_group in zip(
                self.fp16_groups, self.fp32_from_fp16_groups):
            master_params_to_model_params(fp16_group, fp32_from_fp16_group)

    def _model_params_to_master_params(self):
        for fp16_group, fp32_from_fp16_group in zip(
                self.fp16_groups, self.fp32_from_fp16_groups):
            master_params_to_model_params(fp32_from_fp16_group, fp16_group)

    # To consider: Integrate distributed with this wrapper by registering a hook on each variable
    # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
    def _model_grads_to_master_grads(self):
        for fp16_group, fp32_from_fp16_group in zip(
                self.fp16_groups, self.fp32_from_fp16_groups):
            model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)

    def _downscale_master(self):
        if self.loss_scale != 1.0:
            for group in self.optimizer.param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param.grad.data.mul_(1. / self.loss_scale)

    def clip_master_grads(self, max_norm, norm_type=2):
        """
        Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``.

        Args:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
                infinity norm.

        Returns:
            Total norm of the current fp32 gradients (viewed as a single vector).

        .. warning::
            Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). # noqa
        """ # noqa
        if not self.overflow:
            fp32_params = []
            for param_group in self.optimizer.param_groups:
                for param in param_group['params']:
                    fp32_params.append(param)
            return self.clip_grad_norm(fp32_params, max_norm, norm_type)
        else:
            return -1

    def state_dict(self):
        """
        Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
        This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
        of the contained Pytorch optimizer.
        Example::

            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            torch.save(checkpoint, "saved.pth")
        """
        state_dict = {}
        state_dict['loss_scaler'] = self.loss_scaler
        state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
        state_dict['overflow'] = self.overflow
        state_dict[
            'first_closure_call_this_step'] = self.first_closure_call_this_step
        state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
        state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups
        return state_dict

    def load_state_dict(self, state_dict):
        """
        Loads a state_dict created by an earlier call to state_dict().
        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
        whose parameters in turn came from ``model``, it is expected that the user
        will call ``model.load_state_dict()`` before
        ``fp16_optimizer_instance.load_state_dict()`` is called.

        Example::

            model = torch.nn.Linear(D_in, D_out).cuda().half()
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
            ...
            checkpoint = torch.load("saved.pth")
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        """
        # I think it should actually be ok to reload the optimizer before the model.
        self.loss_scaler = state_dict['loss_scaler']
        self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
        self.overflow = state_dict['overflow']
        self.first_closure_call_this_step = state_dict[
            'first_closure_call_this_step']
        self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        # At this point, the optimizer's references to the model's fp32 parameters are up to date.
        # The optimizer's hyperparameters and internal buffers are also up to date.
        # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
        # out of date. There are two options.
        # 1: Refresh the master params from the model's fp16 params.
        # This requires less storage but incurs precision loss.
        # 2: Save and restore the fp32 master copies separately.
        # We choose option 2.
        #
        # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
        # of their associated parameters, because it's possible those buffers might not exist yet in
        # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
        # constructed in the same way as the one whose state_dict we are loading, the same master params
        # are guaranteed to exist, so we can just copy_() from the saved master params.
        for current_group, saved_group in zip(self.fp32_from_fp16_groups,
                                              state_dict['fp32_from_fp16']):
            for current, saved in zip(current_group, saved_group):
                current.data.copy_(saved.data)

    def step(self, closure=None):  # could add clip option.
        """
        If no closure is supplied, :attr:`step` should be called after
        ``fp16_optimizer_obj.backward(loss)``.
        :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to
        :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params
        originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run
        another forward pass using their model.

        If a closure is supplied, :attr:`step` may be called without a prior call to
        :attr:`backward(loss)`.
        This control flow is identical to `ordinary Pytorch optimizer use`_ with closures.
        However, the user should take care that any ``loss.backward()`` call within the closure
        has been replaced by ``fp16_optimizer_obj.backward(loss)``.

        Args:
            closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. # noqa

        Example with closure::

            # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an
            # existing pytorch optimizer.
            for input, target in dataset:
                def closure():
                    optimizer.zero_grad()
                    output = model(input)
                    loss = loss_fn(output, target)
                    # loss.backward() becomes:
                    optimizer.backward(loss)
                    return loss
                optimizer.step(closure)

        .. warning::
            Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling.

        .. _`ordinary Pytorch optimizer use`:
            http://pytorch.org/docs/master/optim.html#optimizer-step-closure
        """

        scale = self.loss_scaler.loss_scale
        self._update_scale(self.overflow)

        if self.overflow:
            self.maybe_print(
                'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}'
                .format(scale, self.loss_scale))
            return

        if closure is not None:
            retval = self._step_with_closure(closure)
        else:
            retval = self.optimizer.step()

        self._master_params_to_model_params()

        return retval

    def _step_with_closure(self, closure):

        def wrapped_closure():
            # helpful for debugging
            # print("Calling wrapped_closure, first_closure_call_this_step = {}"
            #       .format(self.first_closure_call_this_step))
            if self.first_closure_call_this_step:
                # We expect that the fp16 params are initially fresh on entering self.step(),
                # so _master_params_to_model_params() is unnecessary the first time wrapped_closure()
                # is called within self.optimizer.step().
                self.first_closure_call_this_step = False
            else:
                # If self.optimizer.step() internally calls wrapped_closure more than once,
                # it may update the fp32 params after each call. However, self.optimizer
                # doesn't know about the fp16 params at all. If the fp32 params get updated,
                # we can't rely on self.optimizer to refresh the fp16 params. We need
                # to handle that manually:
                self._master_params_to_model_params()
            # Our API expects the user to give us ownership of the backward() call by
            # replacing all calls to loss.backward() with optimizer.backward(loss).
            # This requirement holds whether or not the call to backward() is made within a closure.
            # If the user is properly calling optimizer.backward(loss) within "closure,"
            # calling closure() here will give the fp32 master params fresh gradients
            # for the optimizer to play with, so all wrapped_closure needs to do is call
            # closure() and return the loss.
            temp_loss = closure()
            while self.overflow:
                scale = self.loss_scaler.loss_scale
                self._update_scale(self.overflow)
                self.maybe_print(
                    'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, '
                    'reducing to {}'.format(scale, self.loss_scale))
                temp_loss = closure()
            return temp_loss

        retval = self.optimizer.step(wrapped_closure)

        self.first_closure_call_this_step = True

        return retval

    def backward(self, loss, update_master_grads=True, retain_graph=False):
        """
        :attr:`backward` performs the following conceptual steps:

        1. fp32_loss = loss.float() (see first Note below)
        2. scaled_loss = fp32_loss*loss_scale
        3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). # noqa
        4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. # noqa
        5. Finally, master grads are divided by loss_scale.

        In this way, after :attr:`backward`, the master params have fresh gradients,
        and :attr:`step` may be called.

        .. note::
            :attr:`backward` internally converts the loss to fp32 before applying the loss scale.
            This provides some additional safety against overflow if the user has supplied an
            fp16 loss value.
            However, for maximum overflow safety, the user should
            compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to
            :attr:`backward`.

        .. warning::
            The gradients found in a model's leaves after the call to
            :attr:`backward` should not be regarded as valid in general,
            because it's possible
            they have been scaled (and in the case of dynamic loss scaling,
            the scale factor may change over time).
            If the user wants to inspect gradients after a call to :attr:`backward`,
            only the master gradients should be regarded as valid. These can be retrieved via
            :attr:`inspect_master_grad_data()`.

        Args:
            loss: The loss output by the user's model. loss may be either float or half (but see first Note above).
            update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`.
            retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below).

        Example::

            # Ordinary operation:
            optimizer.backward(loss)

            # Naive operation with multiple losses (technically valid, but less efficient):
            # fp32 grads will be correct after the second call, but
            # the first call incurs an unnecessary fp16->fp32 grad copy.
            optimizer.backward(loss1)
            optimizer.backward(loss2)

            # More efficient way to handle multiple losses:
            # The fp16->fp32 grad copy is delayed until fp16 grads from all
            # losses have been accumulated.
            optimizer.backward(loss1, update_master_grads=False)
            optimizer.backward(loss2, update_master_grads=False)
            optimizer.update_master_grads()
        """ # noqa
        # To consider: try multiple backward passes using retain_grad=True to find
        # a loss scale that works. After you find a loss scale that works, do a final dummy
        # backward pass with retain_graph=False to tear down the graph. Doing this would avoid
        # discarding the iteration, but probably wouldn't improve overall efficiency.
        self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
        if update_master_grads:
            self.update_master_grads()

    def update_master_grads(self):
        """
        Copy the ``.grad`` attribute from stored references to fp16 parameters to
        the ``.grad`` attribute of the fp32 master parameters that are directly
        updated by the optimizer. :attr:`update_master_grads` only needs to be called if
        ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
        """
        if self.dynamic_loss_scale:
            self._check_overflow()
            if self.overflow: return  # noqa
        self._model_grads_to_master_grads()
        self._downscale_master()

    def inspect_master_grad_data(self):
        """
        When running with :class:`FP16_Optimizer`,
        ``.grad`` attributes of a model's fp16 leaves should not be
        regarded as truthful, because they might be scaled.
        After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
        the fp32 master params' ``.grad``
        attributes will contain valid gradients properly divided by the loss scale. However,
        because :class:`FP16_Optimizer` flattens some parameters, accessing them may be
        nonintuitive. :attr:`inspect_master_grad_data`
        allows those gradients to be viewed with shapes corresponding to their associated model leaves.

        Returns:
            List of lists (one list for each parameter group). The list for each parameter group
            is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group.
        """
        if self.overflow:
            print(
                'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. '
                'Gradients are currently invalid (may be inf, nan, or stale). Returning None.'
            )
            return None
        else:
            # The optimizer owns only references to master params.
            master_grads_data = []
            for param_group in self.optimizer.param_groups:
                master_grads_this_group = []
                for param in param_group['params']:
                    if param.grad is not None:
                        master_grads_this_group.append(param.grad.data)
                    else:
                        master_grads_this_group.append(None)
                master_grads_data.append(master_grads_this_group)
            return master_grads_data

    # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
    def _get_loss_scale(self):
        return self.loss_scaler.loss_scale

    def _set_loss_scale(self, value):
        self.loss_scaler.cur_scale = value

    loss_scale = property(_get_loss_scale, _set_loss_scale)

    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
    def _get_state(self):
        return self.optimizer.state

    def _set_state(self, value):
        self.optimizer.state = value

    state = property(_get_state, _set_state)

    # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
    # (for example, to adjust the learning rate)
    def _get_param_groups(self):
        return self.optimizer.param_groups

    def _set_param_groups(self, value):
        self.optimizer.param_groups = value

    param_groups = property(_get_param_groups, _set_param_groups)
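

# A minimal end-to-end sketch (not part of the upstream apex file) tying
# FP16_Module and FP16_Optimizer together. The sizes and the static loss
# scale below are illustrative assumptions, not values from this repo.
if __name__ == '__main__':
    model = FP16_Module(torch.nn.Linear(16, 4).cuda())
    for p in model.parameters():
        p.model_parallel = False  # this fork expects the mpu flag on every param
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
    for _ in range(10):
        x = torch.randn(8, 16).cuda()
        loss = model(x).pow(2).mean()
        optimizer.zero_grad()
        optimizer.backward(loss)  # replaces loss.backward()
        optimizer.step()  # updates fp32 masters, then copies back to fp16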
220
modelscope/models/nlp/txl_poem/gpt2/fp16/fp16util.py
Executable file
@@ -0,0 +1,220 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Variable

from .. import mpu


class tofp16(nn.Module):
    """
    Utility module that implements::

        def forward(self, input):
            return input.half()
    """

    def __init__(self):
        super(tofp16, self).__init__()

    def forward(self, input):
        return input.half()


def BN_convert_float(module):
    """
    Utility function for network_to_half().

    Retained for legacy purposes.
    """
    if isinstance(
            module,
            torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
        module.float()
    for child in module.children():
        BN_convert_float(child)
    return module


def network_to_half(network):
    """
    Convert model to half precision in a batchnorm-safe way.

    Retained for legacy purposes. It is recommended to use FP16Model.
    """
    return nn.Sequential(tofp16(), BN_convert_float(network.half()))


def convert_module(module, dtype):
    """
    Converts a module's immediate parameters and buffers to dtype.
    """
    for param in module.parameters(recurse=False):
        if param is not None:
            if param.data.dtype.is_floating_point:
                param.data = param.data.to(dtype=dtype)
            if param._grad is not None and param._grad.data.dtype.is_floating_point:
                param._grad.data = param._grad.data.to(dtype=dtype)

    for buf in module.buffers(recurse=False):
        if buf is not None and buf.data.dtype.is_floating_point:
            buf.data = buf.data.to(dtype=dtype)


def convert_network(network, dtype):
    """
    Converts a network's parameters and buffers to dtype.
    """
    for module in network.modules():
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm
                      ) and module.affine is True:
            continue
        convert_module(module, dtype)
    return network


class FP16Model(nn.Module):
    """
    Convert model to half precision in a batchnorm-safe way.
    """

    def __init__(self, network):
        super(FP16Model, self).__init__()
        self.network = convert_network(network, dtype=torch.half)

    def forward(self, *inputs):
        inputs = tuple(t.half() for t in inputs)
        return self.network(*inputs)


def backwards_debug_hook(grad):
    raise RuntimeError(
        'master_params received a gradient in the backward pass!')


def prep_param_lists(model, flat_master=False):
    """
    Creates a list of FP32 master parameters for a given model, as in
    `Training Neural Networks with Mixed Precision: Real Examples`_.

    Args:
        model (torch.nn.Module): Existing Pytorch model
        flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. # noqa
    Returns:
        A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. # noqa

    Example::

        model_params, master_params = prep_param_lists(model)

    .. warning::
        Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. # noqa

    .. _`Training Neural Networks with Mixed Precision: Real Examples`:
        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
    """ # noqa
    model_params = [
        param for param in model.parameters() if param.requires_grad
    ]

    if flat_master:
        # Give the user some more useful error messages
        try:
            # flatten_dense_tensors returns a contiguous flat array.
            # http://pytorch.org/docs/master/_modules/torch/_utils.html
            master_params = _flatten_dense_tensors(
                [param.data for param in model_params]).float()
        except:  # noqa
            print(
                'Error in prep_param_lists: model may contain a mixture of parameters '
                'of different types. Use flat_master=False, or use FP16_Optimizer.'
            )
            raise
        master_params = torch.nn.Parameter(master_params)
        master_params.requires_grad = True
        # master_params.register_hook(backwards_debug_hook)
        if master_params.grad is None:
            master_params.grad = master_params.new(*master_params.size())
        return model_params, [master_params]
    else:
        master_params = [
            param.clone().float().detach() for param in model_params
        ]
        for param in master_params:
            param.requires_grad = True
        return model_params, master_params


def model_grads_to_master_grads(model_params,
                                master_params,
                                flat_master=False):
    """
    Copy model gradients to master gradients.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. # noqa
    """ # noqa
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        master_params[0].grad.data.copy_(
            _flatten_dense_tensors([p.grad.data for p in model_params]))
    else:
        for model, master in zip(model_params, master_params):
            if model.grad is not None:
                if master.grad is None:
                    master.grad = Variable(
                        master.data.new(*master.data.size()))
                master.grad.data.copy_(model.grad.data)
            else:
                master.grad = None


def master_params_to_model_params(model_params,
                                  master_params,
                                  flat_master=False):
    """
    Copy master parameters to model parameters.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. # noqa
    """ # noqa
    if flat_master:
        for model, master in zip(
                model_params,
                _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
        for model, master in zip(model_params, master_params):
            model.data.copy_(master.data)
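

# A minimal sketch (not part of the upstream apex file) of the manual
# master-weight workflow the three helpers above enable; the layer sizes
# are illustrative and a CUDA device is assumed.
if __name__ == '__main__':
    model = torch.nn.Linear(8, 2).cuda().half()
    model_params, master_params = prep_param_lists(model)
    optimizer = torch.optim.SGD(master_params, lr=1e-3)

    out = model(torch.randn(4, 8).cuda().half())
    out.float().pow(2).mean().backward()

    model_grads_to_master_grads(model_params, master_params)  # fp16 -> fp32 grads
    optimizer.step()  # update the fp32 master copies
    master_params_to_model_params(model_params, master_params)  # copy back to fp16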


# Backward compatibility fixes


def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]


TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

clip_grad_norm = mpu.clip_grad_norm
245
modelscope/models/nlp/txl_poem/gpt2/fp16/loss_scaler.py
Executable file
@@ -0,0 +1,245 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from .. import mpu


# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]


class LossScaler:
    """
    Class that manages a static loss scale. This class is intended to interact with
    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.

    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
    :class:`FP16_Optimizer`'s constructor.

    Args:
        scale (float, optional, default=1.0): The loss scale.
    """

    def __init__(self, scale=1):
        self.cur_scale = scale

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        return False

    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
        return False

    def update_scale(self, overflow):
        pass

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss, retain_graph=False):
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)


class DynamicLossScaler:
    """
    Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
    :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
    operates, because the default options can be changed using the
    ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.

    Loss scaling is designed to combat the problem of underflowing gradients encountered at long
    times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss
    scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
    occurred.
    :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
    If a certain number of iterations occur without overflowing gradients detected,
    :class:`DynamicLossScaler` increases the loss scale once more.
    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
    always using the highest loss scale possible without incurring overflow.

    Args:
        init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
        scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. # noqa
        scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. # noqa
    """ # noqa

    def __init__(self,
                 init_scale=2**32,
                 scale_factor=2.,
                 scale_window=1000,
                 min_scale=1,
                 delayed_shift=1,
                 consecutive_hysteresis=False):
        self.cur_scale = init_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.min_scale = min_scale
        self.delayed_shift = delayed_shift
        self.cur_hysteresis = delayed_shift
        self.consecutive_hysteresis = consecutive_hysteresis

    # `params` is a list / generator of torch.Variable
    def has_overflow_serial(self, params):
        for p in params:
            if p.grad is not None and DynamicLossScaler._has_inf_or_nan(
                    p.grad.data):
                return True

        return False

    def has_overflow(self, params):
        overflow = self.has_overflow_serial(params)
        # Since each model parallel GPU carries only part of the model,
        # make sure overflow flag is synced across all the model parallel GPUs
        overflow_gpu = torch.cuda.ByteTensor([overflow])
        torch.distributed.all_reduce(
            overflow_gpu,
            op=torch.distributed.ReduceOp.MAX,
            group=mpu.get_model_parallel_group())
        overflow = overflow_gpu[0].item()
        return bool(overflow)

    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
        try:
            # if x is half, the .float() incurs an additional deep copy, but it's necessary if
            # Pytorch's .sum() creates a one-element tensor of the same type as x
            # (which is true for some recent version of pytorch).
            cpu_sum = float(x.float().sum())
            # More efficient version that can be used if .sum() returns a Python scalar
            # cpu_sum = float(x.sum())
        except RuntimeError as instance:
            # We want to check if inst is actually an overflow exception.
            # RuntimeError could come from a different error.
            # If so, we still want the exception to propagate.
            if 'value cannot be converted' not in instance.args[0]:
                raise
            return True
        else:
            if cpu_sum == float(
                    'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
                return True
            return False

    # `overflow` is boolean indicating whether the gradient overflowed
    def update_scale(self, overflow):

        if not hasattr(self, 'min_scale'):
            self.min_scale = 1
        if not hasattr(self, 'delayed_shift'):
            self.delayed_shift = 1
        if not hasattr(self, 'cur_hysteresis'):
            self.cur_hysteresis = 1
        if not hasattr(self, 'consecutive_hysteresis'):
            self.consecutive_hysteresis = True
        if overflow:
            # self.cur_scale /= self.scale_factor
            if self.delayed_shift == 1 or self.cur_hysteresis == 1:
                self.cur_scale = max(self.cur_scale / self.scale_factor,
                                     self.min_scale)
            else:
                self.cur_hysteresis -= 1
            self.last_overflow_iter = self.cur_iter
        else:
            if self.consecutive_hysteresis:
                self.cur_hysteresis = self.delayed_shift
            if (self.cur_iter
                    - self.last_overflow_iter) % self.scale_window == 0:
                if not self.consecutive_hysteresis:
                    self.cur_hysteresis = self.delayed_shift
                self.cur_scale *= self.scale_factor
        self.cur_iter += 1

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss, retain_graph=False):
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)
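

# Worked example (illustrative, not from the original file): with the defaults
# init_scale=2**32, scale_factor=2.0 and scale_window=1000, an overflow on any
# iteration halves the scale (2**32 -> 2**31, never below min_scale), while
# each run of 1000 consecutive overflow-free iterations doubles it again, so
# the scaler hovers just below the largest scale the gradients can tolerate.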


##############################################################
# Example usage below here -- assuming it's in a separate file
##############################################################
"""
TO-DO separate out into an example.
if __name__ == "__main__":
    import torch
    from torch.autograd import Variable
    from dynamic_loss_scaler import DynamicLossScaler

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
    x = Variable(torch.randn(N, D_in), requires_grad=False)
    y = Variable(torch.randn(N, D_out), requires_grad=False)

    w1 = Variable(torch.randn(D_in, H), requires_grad=True)
    w2 = Variable(torch.randn(H, D_out), requires_grad=True)
    parameters = [w1, w2]

    learning_rate = 1e-6
    optimizer = torch.optim.SGD(parameters, lr=learning_rate)
    loss_scaler = DynamicLossScaler()

    for t in range(500):
        y_pred = x.mm(w1).clamp(min=0).mm(w2)
        loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
        print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
        print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
        print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))

        # Run backprop
        optimizer.zero_grad()
        loss.backward()

        # Check for overflow
        has_overflow = DynamicLossScaler.has_overflow(parameters)

        # If no overflow, unscale grad and update as usual
        if not has_overflow:
            for param in parameters:
                param.grad.data.mul_(1. / loss_scaler.loss_scale)
            optimizer.step()
        # Otherwise, don't do anything -- ie, skip iteration
        else:
            print('OVERFLOW!')

        # Update loss scale for next iteration
        loss_scaler.update_scale(has_overflow)

"""
18
modelscope/models/nlp/txl_poem/gpt2/model/__init__.py
Executable file
@@ -0,0 +1,18 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .distributed import *  # noqa
from .gpt2_modeling import (GPT2Model,
                            gpt2_get_params_for_weight_decay_optimization)
from .model import BertModel, get_params_for_weight_decay_optimization
122
modelscope/models/nlp/txl_poem/gpt2/model/distributed.py
Executable file
@@ -0,0 +1,122 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Variable
from torch.nn.modules import Module
from torch.nn.parallel.distributed import DistributedDataParallel as DDP

from .. import mpu


class PyTorchDistributedDataParallel(DDP):

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        sd = self.module.state_dict(destination, prefix, keep_vars)
        return sd

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)


class DistributedDataParallel(Module):

    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module
        self.data_parallel_group = mpu.get_data_parallel_group()
        src_rank = mpu.get_model_parallel_rank()
        for p in self.module.parameters():
            if torch.is_tensor(p):
                dist.broadcast(p, src_rank, group=self.data_parallel_group)

        def allreduce_params(reduce_after=True,
                             no_scale=False,
                             fp32_allreduce=False):
            if (self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for name, param in self.module.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = (param.data.type())
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print(
                            'WARNING: gloo dist backend for half parameters may be extremely slow. It is recommended to use the NCCL backend in this case.'  # noqa
                        )  # noqa
                        self.warn_on_half = False
                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    if fp32_allreduce:
                        coalesced = coalesced.float()
                    if not no_scale and not reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    dist.all_reduce(coalesced, group=self.data_parallel_group)
                    torch.cuda.synchronize()
                    if not no_scale and reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        self.hook_handles = []
        self.hooks = []
        for param in list(self.module.parameters()):

            def allreduce_hook(*unused):
                Variable._execution_engine.queue_callback(allreduce_params)

        self.allreduce_params = allreduce_params

    def forward(self, *inputs, **kwargs):
        self.needs_reduction = True
        return self.module(*inputs, **kwargs)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        sd = self.module.state_dict(destination, prefix, keep_vars)

        return sd

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)

    '''
    def _sync_buffers(self):
        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)
    def train(self, mode=True):
        # Clear NCCL communicator and CUDA event cache of the default group ID,
        # These cache will be recreated at the later call. This is currently a
        # work-around for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()
        super(DistributedDataParallel, self).train(mode)
        self.module.train(mode)
    '''
122
modelscope/models/nlp/txl_poem/gpt2/model/gpt2_modeling.py
Executable file
@@ -0,0 +1,122 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT-2 model."""

import torch
import torch.nn.functional as F

from .. import mpu


def init_method_normal(std=0.02):
    """Init method based on normal distribution.

    This is only used for embeddings. The transformer has its
    own initializer.
    """

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_


class GPT2Model(torch.nn.Module):
    """GPT-2 Language model.

    The output of the forward method is the logits (parallel or
    serial depending on the `parallel_output` flag).
    """

    def __init__(self,
                 num_layers,
                 vocab_size,
                 hidden_size,
                 num_attention_heads,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 max_sequence_length,
                 max_memory_length,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 parallel_output=True,
                 relative_encoding=False):

        super(GPT2Model, self).__init__()

        self.parallel_output = parallel_output

        init_method = init_method_normal(std=0.02)

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, hidden_size, init_method=init_method)

        # Transformer
        self.transformer = mpu.GPT2ParallelTransformer(
            num_layers,
            hidden_size,
            num_attention_heads,
            max_sequence_length,
            max_memory_length,
            embedding_dropout_prob,
            attention_dropout_prob,
            output_dropout_prob,
            checkpoint_activations,
            checkpoint_num_layers,
            relative_encoding=relative_encoding)

    def forward(self, input_ids, position_ids, attention_mask, *mems):

        # Embeddings.
        words_embeddings = self.word_embeddings(input_ids)
        embeddings = words_embeddings

        # Transformer.
        transformer_output = self.transformer(embeddings, position_ids,
                                              attention_mask, *mems)
        logits, *hidden_layers = transformer_output
        # Parallel logits.
        logits_parallel = mpu.copy_to_model_parallel_region(logits)
        logits_parallel = F.linear(logits_parallel,
                                   self.word_embeddings.weight)

        if self.parallel_output:
            return (logits_parallel, *hidden_layers)

        return (mpu.gather_from_model_parallel_region(logits_parallel),
                *hidden_layers)


def gpt2_get_params_for_weight_decay_optimization(module):

    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module_ in module.modules():
        if isinstance(module_, (mpu.LayerNorm, torch.nn.LayerNorm)):
            no_weight_decay_params['params'].extend([
                p for p in list(module_._parameters.values()) if p is not None
            ])
        else:
            weight_decay_params['params'].extend([
                p for n, p in list(module_._parameters.items())
                if p is not None and n != 'bias'
            ])
            no_weight_decay_params['params'].extend([
                p for n, p in list(module_._parameters.items())
                if p is not None and n == 'bias'
            ])

    return weight_decay_params, no_weight_decay_params
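The parameter grouping above is typically fed straight into a torch optimizer. A minimal standalone sketch, assuming plain torch.nn.LayerNorm in place of mpu.LayerNorm so it runs without model parallelism (the toy model and hyperparameters are illustrative only):

import torch

# Toy model standing in for GPT2Model; LayerNorm parameters and biases
# should not receive weight decay.
model = torch.nn.Sequential(
    torch.nn.Linear(16, 16),
    torch.nn.LayerNorm(16),
)

decay = {'params': []}
no_decay = {'params': [], 'weight_decay': 0.0}
for m in model.modules():
    if isinstance(m, torch.nn.LayerNorm):
        no_decay['params'].extend(m.parameters(recurse=False))
    else:
        for n, p in m.named_parameters(recurse=False):
            (no_decay if n == 'bias' else decay)['params'].append(p)

# The per-group 'weight_decay' entry overrides the optimizer default.
optimizer = torch.optim.AdamW([decay, no_decay], lr=1e-4, weight_decay=0.01)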
93
modelscope/models/nlp/txl_poem/gpt2/model/model.py
Executable file
@@ -0,0 +1,93 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for wrapping BertModel."""

import torch

from .modeling import (BertConfig, BertForMaskedLM, BertForPreTraining,
                       BertLayerNorm)


def get_params_for_weight_decay_optimization(module):

    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module_ in module.modules():
        if isinstance(module_, (BertLayerNorm, torch.nn.LayerNorm)):
            no_weight_decay_params['params'].extend([
                p for p in list(module_._parameters.values()) if p is not None
            ])
        else:
            weight_decay_params['params'].extend([
                p for n, p in list(module_._parameters.items())
                if p is not None and n != 'bias'
            ])
            no_weight_decay_params['params'].extend([
                p for n, p in list(module_._parameters.items())
                if p is not None and n == 'bias'
            ])

    return weight_decay_params, no_weight_decay_params


class BertModel(torch.nn.Module):

    def __init__(self, args):
        super(BertModel, self).__init__()
        if args.pretrained_bert:
            self.model = BertForPreTraining.from_pretrained(
                args.tokenizer_model_type,
                cache_dir=args.cache_dir,
                fp32_layernorm=args.fp32_layernorm,
                fp32_embedding=args.fp32_embedding,
                layernorm_epsilon=args.layernorm_epsilon)
        else:
            if args.intermediate_size is None:
                intermediate_size = 4 * args.hidden_size
            else:
                intermediate_size = args.intermediate_size
            self.config = BertConfig(
                args.tokenizer_num_tokens,
                hidden_size=args.hidden_size,
                num_hidden_layers=args.num_layers,
                num_attention_heads=args.num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_dropout_prob=args.hidden_dropout,
                attention_probs_dropout_prob=args.attention_dropout,
                max_position_embeddings=args.max_position_embeddings,
                type_vocab_size=args.tokenizer_num_type_tokens,
                fp32_layernorm=args.fp32_layernorm,
                fp32_embedding=args.fp32_embedding,
                fp32_tokentypes=args.fp32_tokentypes,
                layernorm_epsilon=args.layernorm_epsilon,
                deep_init=args.deep_init)
            self.model = BertForPreTraining(self.config)

    def forward(self,
                input_tokens,
                token_type_ids=None,
                attention_mask=None,
                checkpoint_activations=False):
        return self.model(
            input_tokens,
            token_type_ids,
            attention_mask,
            checkpoint_activations=checkpoint_activations)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        return self.model.state_dict(
            destination=destination, prefix=prefix, keep_vars=keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        return self.model.load_state_dict(state_dict, strict=strict)
1535
modelscope/models/nlp/txl_poem/gpt2/model/modeling.py
Executable file
File diff suppressed because it is too large
37
modelscope/models/nlp/txl_poem/gpt2/mpu/__init__.py
Executable file
@@ -0,0 +1,37 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model parallel utility interface."""

from .cross_entropy import vocab_parallel_cross_entropy
from .data import broadcast_data
from .grads import clip_grad_norm
from .initialize import (destroy_model_parallel, get_data_parallel_group,
                         get_data_parallel_rank, get_data_parallel_world_size,
                         get_model_parallel_group, get_model_parallel_rank,
                         get_model_parallel_src_rank,
                         get_model_parallel_world_size,
                         initialize_model_parallel,
                         model_parallel_is_initialized)
from .layers import (ColumnParallelLinear, ParallelEmbedding,
                     RowParallelLinear, VocabParallelEmbedding)
from .mappings import (copy_to_model_parallel_region,
                       gather_from_model_parallel_region,
                       reduce_from_model_parallel_region,
                       scatter_to_model_parallel_region)
from .random import (checkpoint, get_cuda_rng_tracker,
                     model_parallel_cuda_manual_seed,
                     partition_activations_in_checkpoint)
from .transformer import (BertParallelSelfAttention,
                          BertParallelTransformerLayer,
                          GPT2ParallelTransformer, LayerNorm)
823
modelscope/models/nlp/txl_poem/gpt2/mpu/bak_trans
Executable file
@@ -0,0 +1,823 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Transformer."""

import math

import torch
import torch.nn.init as init
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm

from .initialize import get_model_parallel_world_size
from .layers import ColumnParallelLinear
from .layers import RowParallelLinear
from .mappings import gather_from_model_parallel_region

import deepspeed

from .random import checkpoint
from .random import get_cuda_rng_tracker

from .utils import divide
from .utils import split_tensor_along_last_dim


class PositionalEmbedding(torch.nn.Module):

    def __init__(self, hidden_size):
        super(PositionalEmbedding, self).__init__()

        self.hidden_size = hidden_size

        inv_freq = 1 / (10000 ** (torch.arange(0.0, hidden_size, 2.0) / hidden_size))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, pos_seq, bsz=None):
        sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)

        if bsz is not None:
            return pos_emb[None, :, :].expand(bsz, -1, -1)
        else:
            return pos_emb[None, :, :]

class GPT2ParallelSelfAttention(torch.nn.Module):
    """Parallel self-attention layer for GPT2.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence length, and h is the hidden size
    and creates output of the same size.
    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by the number of GPUs
                             used to parallelize the model. Also, we
                             require hidden size to be divisible by n.
        dropout_prob: dropout probability for the attention scores.
        init_method: weight initialization.
        output_layer_init_method: output layer initialization. If None, use
                                  `init_method`.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    """

    def __init__(self, hidden_size, num_attention_heads,
                 attention_dropout_prob, output_dropout_prob,
                 init_method, output_layer_init_method=None, relative_encoding=False):
        super(GPT2ParallelSelfAttention, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Per attention head and per partition values.
        world_size = get_model_parallel_world_size()
        self.hidden_size_per_partition = divide(hidden_size, world_size)
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)
        self.num_attention_heads_per_partition = divide(num_attention_heads,
                                                        world_size)
        self.relative_encoding = relative_encoding
        # Strided linear layer.
        self.query_key_value = ColumnParallelLinear(hidden_size, 3 * hidden_size,
                                                    stride=3,
                                                    gather_output=False,
                                                    init_method=init_method)
        if relative_encoding:
            self.relative = ColumnParallelLinear(hidden_size, hidden_size,
                                                 gather_output=False,
                                                 init_method=init_method)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)

        # Output.
        self.dense = RowParallelLinear(hidden_size,
                                       hidden_size,
                                       input_is_parallel=True,
                                       init_method=output_layer_init_method)
        self.output_dropout = torch.nn.Dropout(output_dropout_prob)

        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """
        new_tensor_shape = tensor.size()[:-1] + \
            (self.num_attention_heads_per_partition,
             self.hidden_size_per_attention_head)
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)

    @staticmethod
    def _rel_shift(x, zero_triu=False):
        # ql x kl x bsz x h
        # bsz x h x ql x kl
        zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1),
                               device=x.device, dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2))

        x = x_padded[:, :, 1:].view_as(x)

        if zero_triu:
            ones = torch.ones((x.size(0), x.size(1)))
            x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None]

        return x

    @staticmethod
    def _rel_shift_latest(x: torch.Tensor):
        ndims = x.dim()
        x_shape = x.size()
        row_dim = 2
        col_dim = row_dim + 1
        assert col_dim < ndims
        tgt_shape_1, tgt_shape_2 = [], []
        for i in range(ndims):
            if i == row_dim:
                tgt_shape_1.append(x_shape[col_dim])
                tgt_shape_2.append(x_shape[row_dim])
            elif i == col_dim:
                tgt_shape_1.append(x_shape[row_dim])
                tgt_shape_2.append(x_shape[col_dim] - 1)
            else:
                tgt_shape_1.append(x_shape[i])
                tgt_shape_2.append(x_shape[i])
        x = x.view(*tgt_shape_1)
        x = x[:, :, 1:, :]
        x = x.view(*tgt_shape_2)
        return x

    def forward(self, hidden_states, ltor_mask, position_embeddings=None,
                r_w_bias=None, r_r_bias=None, mem=None):
        # hidden_states: [b, s, h]
        # ltor_mask: [1, 1, s, s]

        # Attention heads. [b, s, hp]
        query_length = hidden_states.size(1)

        if mem is None:
            mixed_x_layer = self.query_key_value(hidden_states)
            (mixed_query_layer,
             mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
        else:
            cat = torch.cat((mem, hidden_states), 1)
            mixed_x_layer = self.query_key_value(cat)
            (mixed_query_layer,
             mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
            mixed_query_layer = mixed_query_layer[:, -query_length:]

        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        if self.relative_encoding:
            relative_layer = self.relative(position_embeddings)
            relative_layer = self._transpose_for_scores(relative_layer)  # 1 (bsz) x n_head x klen x d_head
            # Raw attention scores. [b, np, qs, ks]
            rw_head_q = query_layer + r_w_bias.unsqueeze(1)
            ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2))
            rr_head_q = query_layer + r_r_bias.unsqueeze(1)
            bd_score = torch.matmul(rr_head_q, relative_layer.transpose(-1, -2))
            bd_score = self._rel_shift(bd_score)  # qlen x klen x bsz x n_head
            # bd_score = bd_score.permute(2, 3, 0, 1)  # bsz n_head qlen klen

            attention_scores = ac_score + bd_score
        else:
            # Raw attention scores. [b, np, s, s]
            attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(
            self.hidden_size_per_attention_head)
        # Apply the left to right attention mask.
        attention_scores = torch.mul(attention_scores, ltor_mask) - \
            10000.0 * (1.0 - ltor_mask)

        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        with get_cuda_rng_tracker().fork():
            attention_probs = self.attention_dropout(attention_probs)

        # Context layer.
        # [b, np, s, hn]
        context_layer = torch.matmul(attention_probs, value_layer)
        # [b, s, np, hn]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + \
            (self.hidden_size_per_partition,)
        # [b, s, hp]
        context_layer = context_layer.view(*new_context_layer_shape)

        # Output. [b, s, h]
        output = self.dense(context_layer)
        output = self.output_dropout(output)

        return output

@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))


def gelu(x):
    return gelu_impl(x)


class GPT2ParallelMLP(torch.nn.Module):
    """MLP for GPT2.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform gelu transformation, and project the
    state back into h hidden dimension. At the end, dropout is also
    applied.

    Arguments:
        hidden_size: The hidden size of the self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weights are initialized to one.
        output_layer_init_method: output layer initialization. If None,
                                  use `init_method`.
    """

    def __init__(self, hidden_size, output_dropout_prob, init_method,
                 output_layer_init_method=None):
        super(GPT2ParallelMLP, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Project to 4h.
        self.dense_h_to_4h = ColumnParallelLinear(hidden_size, 4 * hidden_size,
                                                  gather_output=False,
                                                  init_method=init_method)
        # Project back to h.
        self.dense_4h_to_h = RowParallelLinear(
            4 * hidden_size,
            hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)
        self.dropout = torch.nn.Dropout(output_dropout_prob)

    def forward(self, hidden_states):
        # [b, s, 4hp]
        intermediate_parallel = self.dense_h_to_4h(hidden_states)
        intermediate_parallel = gelu(intermediate_parallel)

        # [b, s, h]
        output = self.dense_4h_to_h(intermediate_parallel)
        output = self.dropout(output)
        return output

class GPT2ParallelTransformerLayer(torch.nn.Module):
    """A single layer transformer for GPT2.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    Transformer layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention heads in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weights are initialized to one.
        output_layer_init_method: output layers (attention output and
                                  mlp output) initialization. If None,
                                  use `init_method`.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None,
                 relative_encoding=False):
        super(GPT2ParallelTransformerLayer, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        # Layernorm on the input data.
        self.input_layernorm = torch.nn.LayerNorm(hidden_size, eps=layernorm_epsilon)

        # Self attention.
        self.attention = GPT2ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method,
            relative_encoding=relative_encoding)

        # Layernorm after the self attention.
        self.post_attention_layernorm = LayerNorm(hidden_size,
                                                  eps=layernorm_epsilon)

        # MLP
        self.mlp = GPT2ParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)

    def forward(self, hidden_states, ltor_mask, position_embeddings=None,
                r_w_bias=None, r_r_bias=None, mem=None):
        # hidden_states: [b, s, h]
        # ltor_mask: [1, 1, s, s]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        mem = self.input_layernorm(mem) if mem is not None else None
        # Self attention.
        attention_output = self.attention(layernorm_output, ltor_mask,
                                          position_embeddings, r_w_bias,
                                          r_r_bias, mem)
        # Residual connection.
        layernorm_input = hidden_states + attention_output
        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)
        # MLP.
        mlp_output = self.mlp(layernorm_output)
        # Second residual connection.
        output = layernorm_input + mlp_output

        return output


def unscaled_init_method(sigma):
    """Init method based on N(0, sigma)."""

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def scaled_init_method(sigma, num_layers):
    """Init method based on N(0, sigma/sqrt(2*num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_

class GPT2ParallelTransformer(torch.nn.Module):
    """GPT-2 transformer.

    This module takes input from the embedding layer and its output can
    be used directly by a logit layer. It consists of L (num-layers)
    blocks of:
        layer norm
        self attention
        residual connection
        layer norm
        mlp
        residual connection
    followed by a final layer norm.

    Arguments:
        num_layers: Number of transformer layers.
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention heads in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        checkpoint_activations: if True, checkpoint activations.
        checkpoint_num_layers: number of layers to checkpoint. This
                               is basically the chunk size in checkpointing.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method_std: standard deviation of the init method which has
                         the form N(0, std).
        use_scaled_init_for_output_weights: If True, use 1/sqrt(2*num_layers)
                                            scaling for the output weights (
                                            output of self attention and mlp).
    """

    def __init__(self,
                 num_layers,
                 hidden_size,
                 num_attention_heads,
                 max_sequence_length,
                 max_memory_length,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 layernorm_epsilon=1.0e-5,
                 init_method_std=0.02,
                 use_scaled_init_for_output_weights=True,
                 relative_encoding=False):
        super(GPT2ParallelTransformer, self).__init__()
        # Store activation checkpointing flag.
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers
        self.max_memory_length = max_memory_length

        output_layer_init_method = None
        if use_scaled_init_for_output_weights:
            output_layer_init_method = scaled_init_method(init_method_std,
                                                          num_layers)
        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
        self.relative_encoding = relative_encoding
        if relative_encoding:
            # Relative position embedding
            self.position_embeddings = PositionalEmbedding(hidden_size)
            # Per attention head and per partition values.
            world_size = get_model_parallel_world_size()
            self.hidden_size_per_attention_head = divide(hidden_size,
                                                         num_attention_heads)
            self.num_attention_heads_per_partition = divide(num_attention_heads,
                                                            world_size)
            self.r_w_bias = torch.nn.Parameter(
                torch.Tensor(self.num_attention_heads_per_partition,
                             self.hidden_size_per_attention_head))
            self.r_w_bias.model_parallel = True
            self.r_r_bias = torch.nn.Parameter(
                torch.Tensor(self.num_attention_heads_per_partition,
                             self.hidden_size_per_attention_head))
            self.r_r_bias.model_parallel = True
            # Always initialize bias to zero.
            with torch.no_grad():
                self.r_w_bias.zero_()
                self.r_r_bias.zero_()
        else:
            # Position embedding (serial).
            self.position_embeddings = torch.nn.Embedding(max_sequence_length,
                                                          hidden_size)
            # Initialize the position embeddings.
            torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0,
                                  std=init_method_std)

        def get_layer():
            return GPT2ParallelTransformerLayer(
                hidden_size,
                num_attention_heads,
                attention_dropout_prob,
                output_dropout_prob,
                layernorm_epsilon,
                unscaled_init_method(init_method_std),
                output_layer_init_method=output_layer_init_method,
                relative_encoding=relative_encoding)

        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [get_layer() for _ in range(num_layers)])

        # Final layer norm before output.
        self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def forward(self, hidden_states, position_ids, attention_mask, *mems):
        batch_size, query_length = hidden_states.size()[:2]
        memory_length = mems[0].size(1) if mems else 0
        key_length = query_length + memory_length
        attention_mask = attention_mask[:, :, :, -query_length - memory_length:]
        if self.relative_encoding:
            # Dropout on the word embeddings (applied once).
            hidden_states = self.embedding_dropout(hidden_states)
            position_sequence = torch.arange(key_length - 1, -1, -1.0,
                                             device=hidden_states.device,
                                             dtype=hidden_states.dtype)
            position_embeddings = self.position_embeddings(position_sequence)
            # Apply dropout
            position_embeddings = self.embedding_dropout(position_embeddings)
        else:
            position_embeddings = self.position_embeddings(position_ids)
            hidden_states = hidden_states + position_embeddings
            hidden_states = self.embedding_dropout(hidden_states)
        if self.max_memory_length > 0:
            mem_layers = [hidden_states.detach()]
        else:
            mem_layers = []

        def custom(start, end):
            def custom_forward(*inputs):
                layers_ = self.layers[start:end]
                x_, inputs = inputs[0], inputs[1:]
                if self.relative_encoding:
                    inputs, mems_ = inputs[:4], inputs[4:]
                else:
                    inputs, mems_ = inputs[:1], inputs[1:]
                for i, layer in enumerate(layers_):
                    mem_i_ = mems_[i] if mems_ else None
                    x_ = layer(x_, *inputs, mem=mem_i_)
                    if self.max_memory_length > 0:
                        mem_layers.append(x_.detach())
                return x_

            return custom_forward

        if self.checkpoint_activations:
            l = 0
            num_layers = len(self.layers)
            chunk_length = self.checkpoint_num_layers
            while l < num_layers:
                args = [hidden_states, attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                if mems:
                    args += mems[l: l + chunk_length]
                hidden_states = checkpoint(custom(l, l + chunk_length), *args)
                l += chunk_length
        else:
            for i, layer in enumerate(self.layers):
                args = [hidden_states, attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                mem_i = mems[i] if mems else None
                hidden_states = layer(*args, mem=mem_i)
                if self.max_memory_length > 0:
                    mem_layers.append(hidden_states.detach())

        # Final layer norm.
        output = self.final_layernorm(hidden_states)
        if self.max_memory_length > 0:
            mem_layers = self.update_mems(mem_layers, mems)

        return (output, *mem_layers)

    def update_mems(self, hiddens, mems):
        memory_length = mems[0].size(1) if mems else 0
        query_length = hiddens[0].size(1)
        new_memory_length = min(self.max_memory_length,
                                memory_length + query_length)
        new_mems = []
        with torch.no_grad():
            for i in range(len(hiddens)):
                if new_memory_length <= query_length:
                    new_mems.append(hiddens[i][:, -new_memory_length:])
                else:
                    new_mems.append(torch.cat(
                        (mems[i][:, -new_memory_length + query_length:],
                         hiddens[i]), dim=1))
        return new_mems

class BertParallelSelfAttention(torch.nn.Module):
    """Parallel self-attention layer for BERT.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence length, and h is the hidden size
    and creates output of the same size.
    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by the number of GPUs
                             used to parallelize the model. Also, we
                             require hidden size to be divisible by n.
        dropout_prob: dropout probability for the attention scores.
        output_parallel: If true, no all-gather is done on the output and
                         the output values will be per partition.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    """

    def __init__(self, hidden_size, num_attention_heads,
                 dropout_prob, output_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelSelfAttention, self).__init__()
        # Input configuration.
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.dropout_prob = dropout_prob
        self.output_parallel = output_parallel
        # Per attention head and per partition values.
        world_size = get_model_parallel_world_size()
        self.hidden_size_per_partition = divide(hidden_size, world_size)
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)
        self.num_attention_heads_per_partition = divide(num_attention_heads,
                                                        world_size)
        # Strided linear layer.
        self.query_key_value = ColumnParallelLinear(hidden_size, 3 * hidden_size,
                                                    stride=3,
                                                    gather_output=False,
                                                    init_method=init_method)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.dropout = torch.nn.Dropout(dropout_prob)

        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """
        new_tensor_shape = tensor.size()[:-1] + \
            (self.num_attention_heads_per_partition,
             self.hidden_size_per_attention_head)
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):

        # Attention heads. [b, s, hp]
        mixed_x_layer = self.query_key_value(hidden_states)
        (mixed_query_layer,
         mixed_key_layer,
         mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)

        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)

        # Raw attention scores. [b, np, s, s]
        norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head))
        attention_scores = torch.matmul(query_layer / norm_factor,
                                        key_layer.transpose(-1, -2) / norm_factor)
        # Apply the attention mask.
        attention_scores += attention_mask

        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        with get_cuda_rng_tracker().fork():
            attention_probs = self.dropout(attention_probs)

        # Context layer.
        # [b, np, s, hn]
        context_layer = torch.matmul(attention_probs, value_layer)
        # [b, s, np, hn]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + \
            (self.hidden_size_per_partition,)
        # [b, s, hp]
        context_layer = context_layer.view(*new_context_layer_shape)

        # Output. [b, s, h]
        if self.output_parallel:
            output = context_layer
        else:
            output = gather_from_model_parallel_region(context_layer)

        return output


class BertParallelTransformerOutput(torch.nn.Module):
    """The output layer used after self attention and intermediate
    parts of transformer layer."""

    def __init__(self, input_size, output_size, dropout_prob,
                 layernorm_epsilon=1.0e-12, input_is_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerOutput, self).__init__()
        # Components.
        self.dense = RowParallelLinear(input_size,
                                       output_size,
                                       input_is_parallel=input_is_parallel,
                                       init_method=init_method)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        layernorm_input = hidden_states + input_tensor
        hidden_states = self.layernorm(layernorm_input)
        return hidden_states


class BertParallelTransformerLayer(torch.nn.Module):
    """A single layer transformer for Bert.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    Transformer layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        intermediate_size: size of the intermediate state after
                           self attention. In both BERT and GPT
                           this is set to be 4 times the hidden
                           size.
        num_attention_heads: number of attention heads in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        intermediate_activation_fn: activation function for output
                                    of intermediate.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weights are initialized to one.
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 intermediate_activation_fn,
                 layernorm_epsilon,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerLayer, self).__init__()

        # Self attention.
        self.attention = BertParallelSelfAttention(hidden_size,
                                                   num_attention_heads,
                                                   attention_dropout_prob,
                                                   output_parallel=True,
                                                   init_method=init_method)
        # Self attention output.
        self.self_output = BertParallelTransformerOutput(
            hidden_size, hidden_size, output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method)
        # Intermediate.
        self.intermediate = ColumnParallelLinear(hidden_size, intermediate_size,
                                                 gather_output=False,
                                                 init_method=init_method)
        self.intermediate_activation_fn = intermediate_activation_fn
        # Output.
        self.output = BertParallelTransformerOutput(
            intermediate_size, hidden_size, output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method)

    def forward(self, hidden_states, attention_mask):
        # [b, s, hp]
        attention_output_parallel = self.attention(hidden_states,
                                                   attention_mask)
        # [b, s, h]
        attention_self_output = self.self_output(attention_output_parallel,
                                                 hidden_states)
        # [b, s, ip]
        intermediate_output_parallel = self.intermediate(attention_self_output)
        intermediate_output_parallel = self.intermediate_activation_fn(
            intermediate_output_parallel)
        # [b, s, h]
        layer_output = self.output(intermediate_output_parallel,
                                   attention_self_output)

        return layer_output
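The `_rel_shift` trick used above can be checked in isolation; a small sketch with arbitrary toy shapes (not from this repository):

import torch

# Pad one zero column, fold the pad across a reshape, drop the first row:
# row i of the score matrix ends up shifted left one step further than
# row i + 1, aligning entries on relative distance (entries that wrap
# past the edge are masked out downstream).
b, n, qlen, klen = 1, 1, 3, 3
x = torch.arange(qlen * klen, dtype=torch.float).view(b, n, qlen, klen)

zero_pad = torch.zeros((b, n, qlen, 1))
x_padded = torch.cat([zero_pad, x], dim=-1)      # [b, n, qlen, klen + 1]
x_padded = x_padded.view(b, n, klen + 1, qlen)   # fold the padding in
shifted = x_padded[:, :, 1:].view_as(x)          # [b, n, qlen, klen]
print(shifted)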
110
modelscope/models/nlp/txl_poem/gpt2/mpu/cross_entropy.py
Executable file
@@ -0,0 +1,110 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from .initialize import (get_model_parallel_group, get_model_parallel_rank,
                         get_model_parallel_world_size)
from .utils import VocabUtility


class _VocabParallelCrossEntropy(torch.autograd.Function):

    @staticmethod
    def forward(ctx, vocab_parallel_logits, target):

        # Copy so the input remains unchanged.
        logits = vocab_parallel_logits.clone()
        # Maximum value along vocab dimension across all GPUs.
        logits_max = torch.max(logits, dim=-1)[0]
        torch.distributed.all_reduce(
            logits_max,
            op=torch.distributed.ReduceOp.MAX,
            group=get_model_parallel_group())
        # Subtract the maximum value.
        logits.sub_(logits_max.unsqueeze(dim=-1))
        # Sum of exponential of logits along vocab dimension across all GPUs.
        exp_logits = logits.exp()
        sum_exp_logits = exp_logits.sum(dim=-1)
        torch.distributed.all_reduce(
            sum_exp_logits,
            op=torch.distributed.ReduceOp.SUM,
            group=get_model_parallel_group())

        # Get the partition's vocab indices.
        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
        partition_vocab_size = vocab_parallel_logits.size()[-1]
        rank = get_model_parallel_rank()
        world_size = get_model_parallel_world_size()
        vocab_start_index, vocab_end_index = get_vocab_range(
            partition_vocab_size, rank, world_size)

        # Create a mask of valid vocab ids (1 means it needs to be masked).
        target_mask = (target < vocab_start_index) | (
            target >= vocab_end_index)
        masked_target = target.clone() - vocab_start_index
        masked_target[target_mask] = 0

        # Get predicted-logits = logits[target].
        # For simplicity, we convert logits to a 2-D tensor with size
        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
        logits_2d = logits.view(-1, partition_vocab_size)
        masked_target_1d = masked_target.view(-1)
        arange_1d = torch.arange(
            start=0, end=logits_2d.size()[0], device=logits_2d.device)
        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
        predicted_logits = predicted_logits_1d.view_as(target)
        predicted_logits[target_mask] = 0.0
        # All reduce is needed to get the chunks from other GPUs.
        torch.distributed.all_reduce(
            predicted_logits,
            op=torch.distributed.ReduceOp.SUM,
            group=get_model_parallel_group())

        # Loss = log(sum(exp(logits))) - predicted-logit.
        loss = torch.log(sum_exp_logits) - predicted_logits

        # Store softmax, target-mask and masked-target for backward pass.
        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)

        return loss

    @staticmethod
    def backward(ctx, grad_output):

        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target_1d = ctx.saved_tensors

        # All the inputs have softmax as their gradient.
        grad_input = softmax
        # For simplicity, work with the 2D gradient.
        partition_vocab_size = softmax.size()[-1]
        grad_2d = grad_input.view(-1, partition_vocab_size)

        # Add the gradient from matching classes.
        arange_1d = torch.arange(
            start=0, end=grad_2d.size()[0], device=grad_2d.device)
        grad_2d[arange_1d,
                masked_target_1d] -= (1.0 - target_mask.view(-1).float())

        # Finally elementwise multiplication with the output gradients.
        grad_input.mul_(grad_output.unsqueeze(dim=-1))

        return grad_input, None


def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
    """Helper function for the cross entropy."""
    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
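A standalone sketch of the identity the forward pass relies on (loss = logsumexp(logits) - logit_target, computed after max-subtraction for numerical stability); this runs without any distributed setup:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)
target = torch.randint(0, 10, (4,))

# Subtract the row-wise max (as the kernel above does), then apply the
# identity; max-subtraction cancels out of the loss exactly.
m = logits.max(dim=-1, keepdim=True)[0]
shifted = logits - m
loss = torch.log(shifted.exp().sum(-1)) - shifted[torch.arange(4), target]

print(torch.allclose(loss, F.cross_entropy(logits, target, reduction='none')))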
117
modelscope/models/nlp/txl_poem/gpt2/mpu/data.py
Executable file
@@ -0,0 +1,117 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from .initialize import (get_model_parallel_group, get_model_parallel_rank,
                         get_model_parallel_src_rank)

_MAX_DATA_DIM = 4


def _check_data_types(keys, data, target_dtype):
    """Check that all the keys have the same target data type."""
    for key in keys:
        assert data[key].dtype == target_dtype, '{} has data type {} which '\
            'is different than {}'.format(key, data[key].dtype, target_dtype)


def _build_key_size_numel_dictionaries(keys, data):
    """Build the size on rank 0 and broadcast."""
    max_dim = _MAX_DATA_DIM
    sizes = [0 for _ in range(max_dim) for _ in keys]

    # Pack the sizes on rank zero.
    if get_model_parallel_rank() == 0:
        offset = 0
        for key in keys:
            assert data[key].dim(
            ) < max_dim, 'you should increase MAX_DATA_DIM'
            size = data[key].size()
            for i, s in enumerate(size):
                sizes[i + offset] = s
            offset += max_dim

    # Move to GPU and broadcast.
    sizes_cuda = torch.cuda.LongTensor(sizes)
    torch.distributed.broadcast(
        sizes_cuda,
        get_model_parallel_src_rank(),
        group=get_model_parallel_group())

    # Move back to cpu and unpack.
    sizes_cpu = sizes_cuda.cpu()
    key_size = {}
    key_numel = {}
    total_numel = 0
    offset = 0
    for key in keys:
        i = 0
        size = []
        numel = 1
        while sizes_cpu[offset + i] > 0:
            this_size = sizes_cpu[offset + i]
            size.append(this_size)
            numel *= this_size
            i += 1
        key_size[key] = size
        key_numel[key] = numel
        total_numel += numel
        offset += max_dim

    return key_size, key_numel, total_numel


def broadcast_data(keys, data, datatype):
    """Broadcast data from rank zero of each model parallel group to the
    members of the same model parallel group.

    Arguments:
        keys: list of keys in the data dictionary to be broadcast
        data: data dictionary of string keys and cpu tensor values.
        datatype: torch data type of all tensors in data associated
                  with keys.
    """
    # Build (key, size) and (key, number of elements) dictionaries along
    # with the total number of elements on all ranks.
    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(
        keys, data)

    # Pack on rank zero.
    if get_model_parallel_rank() == 0:
        # Check that all keys have the same data type.
        _check_data_types(keys, data, datatype)
        # Flatten the data associated with the keys
        flatten_data = torch.cat(
            [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
    else:
        flatten_data = torch.empty(
            total_numel, device=torch.cuda.current_device(), dtype=datatype)

    # Broadcast
    torch.distributed.broadcast(
        flatten_data,
        get_model_parallel_src_rank(),
        group=get_model_parallel_group())

    # Unpack
    output = {}
    offset = 0
    for key in keys:
        size = key_size[key]
        numel = key_numel[key]
        output[key] = flatten_data.narrow(0, offset, numel).view(size)
        offset += numel

    return output
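The pack/unpack scheme above can be exercised on a single process; a minimal sketch with hypothetical keys, skipping the actual broadcast:

import torch

data = {'tokens': torch.arange(6).view(2, 3), 'mask': torch.ones(2, 2).long()}
keys = ['tokens', 'mask']

# Pack: flatten every tensor into one contiguous buffer (what rank 0 sends).
flat = torch.cat([data[k].contiguous().view(-1) for k in keys])

# Unpack: slice each tensor back out by its known size (what receivers do).
out, offset = {}, 0
for k in keys:
    numel = data[k].numel()
    out[k] = flat.narrow(0, offset, numel).view(data[k].size())
    offset += numel

assert all(torch.equal(out[k], data[k]) for k in keys)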
72
modelscope/models/nlp/txl_poem/gpt2/mpu/grads.py
Executable file
@@ -0,0 +1,72 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch

import torch
from torch._six import inf

from .initialize import get_model_parallel_group, get_model_parallel_rank


def clip_grad_norm(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_, with
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.

    Returns:
        Total norm of the parameters (viewed as a single vector).
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    max_norm = float(max_norm)
    norm_type = float(norm_type)
    if norm_type == inf:
        total_norm = max(p.grad.data.abs().max() for p in parameters)
        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
        # Take max across all GPUs.
        torch.distributed.all_reduce(
            total_norm_cuda,
            op=torch.distributed.ReduceOp.MAX,
            group=get_model_parallel_group())
        total_norm = total_norm_cuda[0].item()
    else:
        total_norm = 0
        for p in parameters:
            if p.model_parallel or (get_model_parallel_rank() == 0):
                param_norm = p.grad.data.norm(norm_type)
                total_norm += param_norm.item()**norm_type
        # Sum across all model parallel GPUs.
        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
        torch.distributed.all_reduce(
            total_norm_cuda,
            op=torch.distributed.ReduceOp.SUM,
            group=get_model_parallel_group())
        total_norm = total_norm_cuda[0].item()**(1. / norm_type)
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for p in parameters:
            p.grad.data.mul_(clip_coef)
    return total_norm
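Without model parallelism the routine above reduces to standard PyTorch gradient clipping; a quick single-parameter sketch of the expected behavior:

import torch

p = torch.nn.Parameter(torch.randn(8))
p.grad = torch.randn(8) * 10  # deliberately large gradient

# clip_grad_norm_ returns the pre-clipping total norm and rescales
# p.grad in place so its norm is at most max_norm.
total = torch.nn.utils.clip_grad_norm_([p], max_norm=1.0)
print(total, p.grad.norm())  # original norm, then a clipped norm <= 1.0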
130
modelscope/models/nlp/txl_poem/gpt2/mpu/initialize.py
Executable file
@@ -0,0 +1,130 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model and data parallel groups."""

import torch

from .utils import ensure_divisibility

# Model parallel group that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None


def initialize_model_parallel(model_parallel_size_):
    """
    Initialize model and data parallel groups.

    Arguments:
        model_parallel_size: number of GPUs used to parallelize model.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model. The present function will
    create 4 model parallel groups and 2 data parallel groups as:
        4 model parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 data parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, ranks 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    if torch.distributed.get_rank() == 0:
        print('> initializing model parallel with size {}'.format(
            model_parallel_size_))
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size = torch.distributed.get_world_size()
    model_parallel_size = min(model_parallel_size_, world_size)
    ensure_divisibility(world_size, model_parallel_size)
    rank = torch.distributed.get_rank()

    # Build the data parallel groups.
    global _DATA_PARALLEL_GROUP
    assert _DATA_PARALLEL_GROUP is None, \
        'data parallel group is already initialized'
    for i in range(model_parallel_size):
        ranks = range(i, world_size, model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if i == (rank % model_parallel_size):
            _DATA_PARALLEL_GROUP = group

    # Build the model parallel groups.
    global _MODEL_PARALLEL_GROUP
    assert _MODEL_PARALLEL_GROUP is None, \
        'model parallel group is already initialized'
    for i in range(world_size // model_parallel_size):
        ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if i == (rank // model_parallel_size):
            _MODEL_PARALLEL_GROUP = group


def model_parallel_is_initialized():
    """Check if model and data parallel groups are initialized."""
    if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None:
        return False
    return True


def get_model_parallel_group():
    """Get the model parallel group the caller rank belongs to."""
    assert _MODEL_PARALLEL_GROUP is not None, \
        'model parallel group is not initialized'
    return _MODEL_PARALLEL_GROUP


def get_data_parallel_group():
    """Get the data parallel group the caller rank belongs to."""
    assert _DATA_PARALLEL_GROUP is not None, \
        'data parallel group is not initialized'
    return _DATA_PARALLEL_GROUP


def get_model_parallel_world_size():
    """Return world size for the model parallel group."""
    return torch.distributed.get_world_size(group=get_model_parallel_group())


def get_model_parallel_rank():
    """Return my rank for the model parallel group."""
    return torch.distributed.get_rank(group=get_model_parallel_group())


def get_model_parallel_src_rank():
    """Calculate the global rank corresponding to local rank zero
    in the model parallel group."""
    global_rank = torch.distributed.get_rank()
    local_world_size = get_model_parallel_world_size()
    return (global_rank // local_world_size) * local_world_size


def get_data_parallel_world_size():
    """Return world size for the data parallel group."""
    return torch.distributed.get_world_size(group=get_data_parallel_group())


def get_data_parallel_rank():
    """Return my rank for the data parallel group."""
    return torch.distributed.get_rank(group=get_data_parallel_group())


def destroy_model_parallel():
    """Set the groups to none."""
    global _MODEL_PARALLEL_GROUP
    _MODEL_PARALLEL_GROUP = None
    global _DATA_PARALLEL_GROUP
    _DATA_PARALLEL_GROUP = None
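The group layout described in the docstring can be reproduced with plain Python; a sketch for world_size = 8 and model_parallel_size = 2:

world_size, mp = 8, 2

# One data parallel group per model parallel rank (stride = mp) ...
data_parallel_groups = [list(range(i, world_size, mp)) for i in range(mp)]
# ... and one model parallel group per consecutive block of mp ranks.
model_parallel_groups = [list(range(i * mp, (i + 1) * mp))
                         for i in range(world_size // mp)]

print(data_parallel_groups)   # [[0, 2, 4, 6], [1, 3, 5, 7]]
print(model_parallel_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]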
358
modelscope/models/nlp/txl_poem/gpt2/mpu/layers.py
Executable file
@@ -0,0 +1,358 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch

import math

import torch
import torch.nn.functional as F
import torch.nn.init as init
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
from torch.nn.parameter import Parameter

from .initialize import get_model_parallel_rank, get_model_parallel_world_size
from .mappings import (copy_to_model_parallel_region,
                       gather_from_model_parallel_region,
                       reduce_from_model_parallel_region,
                       scatter_to_model_parallel_region)
from .random import get_cuda_rng_tracker
from .utils import VocabUtility, divide, split_tensor_along_last_dim


def _initialize_affine_weight(weight,
                              output_size,
                              input_size,
                              per_partition_size,
                              partition_dim,
                              init_method,
                              stride=1,
                              return_master_weight=False):
    """Initialize affine weight for model parallel.

    Build the master weight on all processes and scatter
    the relevant chunk."""
    # If we only use 1 process for model parallelism, bypass scatter.
    world_size = get_model_parallel_world_size()
    if world_size == 1:
        init_method(weight)
        if return_master_weight:
            return weight
        return None

    # Initialize master weight
    master_weight = torch.empty(
        output_size, input_size, dtype=weight.dtype, requires_grad=False)
    init_method(master_weight)

    # Split and copy
    per_partition_per_stride_size = divide(per_partition_size, stride)
    weight_list = torch.split(
        master_weight, per_partition_per_stride_size, dim=partition_dim)
    rank = get_model_parallel_rank()
    my_weight_list = weight_list[rank::world_size]

    with torch.no_grad():
        torch.cat(my_weight_list, dim=partition_dim, out=weight)
    if return_master_weight:
        return master_weight
    return None


class VocabParallelEmbedding(torch.nn.Module):
    """Embedding parallelized in the vocabulary dimension.

    This is mainly adapted from torch.nn.Embedding and all the default
    values are kept.
    Arguments:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        init_method: method to initialize weights.
    """

    def __init__(self,
                 num_embeddings,
                 embedding_dim,
                 init_method=init.xavier_normal_):
        super(VocabParallelEmbedding, self).__init__()
        # Keep the input dimensions.
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Set the defaults for compatibility.
        self.padding_idx = None
        self.max_norm = None
        self.norm_type = 2.
        self.scale_grad_by_freq = False
        self.sparse = False
        self._weight = None
        # Divide the weight matrix along the vocabulary dimension.
        self.vocab_start_index, self.vocab_end_index = \
            VocabUtility.vocab_range_from_global_vocab_size(
                self.num_embeddings, get_model_parallel_rank(),
                get_model_parallel_world_size())
        self.num_embeddings_per_partition = self.vocab_end_index - \
            self.vocab_start_index  # noqa

        # Allocate weights.
        self.weight = Parameter(
            torch.Tensor(self.num_embeddings_per_partition,
                         self.embedding_dim))
        self.weight.model_parallel = True
        # And initialize.
        _initialize_affine_weight(self.weight, self.num_embeddings,
                                  self.embedding_dim,
                                  self.num_embeddings_per_partition, 0,
                                  init_method)

    def forward(self, input_):
        # Build the mask.
        input_mask = (input_ < self.vocab_start_index) | \
                     (input_ >= self.vocab_end_index)
        # Mask the input.
        masked_input = input_.clone() - self.vocab_start_index
        masked_input[input_mask] = 0
        # Get the embeddings.
        output_parallel = F.embedding(masked_input, self.weight,
                                      self.padding_idx, self.max_norm,
                                      self.norm_type, self.scale_grad_by_freq,
                                      self.sparse)
        # Mask the output embedding.
        output_parallel[input_mask, :] = 0.0
        # Reduce across all the model parallel GPUs.
        output = reduce_from_model_parallel_region(output_parallel)
        return output


class ParallelEmbedding(torch.nn.Module):
    """Embedding parallelized in the embedding dimension.

    This is mainly adapted from torch.nn.Embedding and all the default
    values are kept.
    Arguments:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        init_method: method to initialize weights.
    """

    def __init__(self,
                 num_embeddings,
                 embedding_dim,
                 init_method=init.xavier_normal_,
                 keep_master_weight_for_test=False):
        super(ParallelEmbedding, self).__init__()
        # Keep the input dimensions.
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Set some defaults for compatibility.
        self.padding_idx = None
        self.max_norm = None
        self.norm_type = 2.
        self.scale_grad_by_freq = False
        self.sparse = False
        self._weight = None
        # Divide the weight matrix along the embedding dimension.
        world_size = get_model_parallel_world_size()
        self.embedding_dim_per_partition = divide(self.embedding_dim,
                                                  world_size)

        # Allocate weights.
        self.weight = Parameter(
            torch.Tensor(self.num_embeddings,
                         self.embedding_dim_per_partition))
        self.weight.model_parallel = True
        # And initialize.
        _initialize_affine_weight(
            self.weight,
            self.num_embeddings,
            self.embedding_dim,
            self.embedding_dim_per_partition,
            1,
            init_method,
            stride=1,
            return_master_weight=False)

    def forward(self, input_):
        input_parallel = copy_to_model_parallel_region(input_)
        output_parallel = F.embedding(input_parallel, self.weight,
                                      self.padding_idx, self.max_norm,
                                      self.norm_type, self.scale_grad_by_freq,
                                      self.sparse)
        output = gather_from_model_parallel_region(output_parallel)
        return output


class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].

    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
    """

    def __init__(self,
                 input_size,
                 output_size,
                 bias=True,
                 gather_output=True,
                 init_method=init.xavier_normal_,
                 stride=1,
                 keep_master_weight_for_test=False):
        super(ColumnParallelLinear, self).__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.gather_output = gather_output
        # Divide the weight matrix along the last dimension.
        world_size = get_model_parallel_world_size()
        self.output_size_per_partition = divide(output_size, world_size)

        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        self.weight = Parameter(
            torch.Tensor(self.output_size_per_partition, self.input_size))
        self.weight.model_parallel = True
        if bias:
            self.bias = Parameter(torch.Tensor(self.output_size_per_partition))
            self.bias.model_parallel = True
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)

        # Initialize weight.
        self.master_weight = _initialize_affine_weight(
            self.weight,
            self.output_size,
            self.input_size,
            self.output_size_per_partition,
            0,
            init_method,
            stride=stride,
            return_master_weight=keep_master_weight_for_test)

    def forward(self, input_):
        # Set up backprop all-reduce.
        input_parallel = copy_to_model_parallel_region(input_)
        # Matrix multiply.
        output_parallel = F.linear(input_parallel, self.weight, self.bias)
        if self.gather_output:
            # All-gather across the partitions.
            output = gather_from_model_parallel_region(output_parallel)
        else:
            output = output_parallel
        return output


class RowParallelLinear(torch.nn.Module):
    """Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              |  .  |
          A = |  .  |        X = [X_1, ..., X_p]
              |  .  |
              | A_p |
               -   -
    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
    """

    def __init__(self,
                 input_size,
                 output_size,
                 bias=True,
                 input_is_parallel=False,
                 init_method=init.xavier_normal_,
                 stride=1,
                 keep_master_weight_for_test=False):
        super(RowParallelLinear, self).__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
        # Divide the weight matrix along the last dimension.
        world_size = get_model_parallel_world_size()
        self.input_size_per_partition = divide(input_size, world_size)

        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        self.weight = Parameter(
            torch.Tensor(self.output_size, self.input_size_per_partition))
        self.weight.model_parallel = True
        if bias:
            self.bias = Parameter(torch.Tensor(self.output_size))
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)

        # Initialize weight.
        self.master_weight = _initialize_affine_weight(
            self.weight,
            self.output_size,
            self.input_size,
            self.input_size_per_partition,
            1,
            init_method,
            stride=stride,
            return_master_weight=keep_master_weight_for_test)

    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            input_parallel = input_
        else:
            input_parallel = scatter_to_model_parallel_region(input_)
        # Matrix multiply.
        output_parallel = F.linear(input_parallel, self.weight)
        # All-reduce across all the partitions.
        output_ = reduce_from_model_parallel_region(output_parallel)
        if self.bias is not None:
            output = output_ + self.bias
        else:
            output = output_
        return output
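A sanity check on the column-partitioning math above, as a standalone sketch (plain torch on one process, not part of the file): splitting A column-wise and concatenating the partial outputs is exactly the all-gather step in ColumnParallelLinear's forward.

import torch

# Y = XA computed whole vs. via two column partitions of A.
torch.manual_seed(0)
X = torch.randn(7, 13)          # batch of 7, input_size 13
A = torch.randn(13, 16)         # full weight, output_size 16
A_1, A_2 = A.chunk(2, dim=1)    # two "model parallel" partitions

Y_full = X @ A
Y_parallel = torch.cat([X @ A_1, X @ A_2], dim=1)  # the all-gather step
assert torch.allclose(Y_full, Y_parallel, atol=1e-6)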
144
modelscope/models/nlp/txl_poem/gpt2/mpu/mappings.py
Executable file
@@ -0,0 +1,144 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from .initialize import get_model_parallel_group
from .utils import split_tensor_along_last_dim


def _reduce(input_):
    """All-reduce the input tensor across the model parallel group."""
    group = get_model_parallel_group()

    # Bypass the function if we are using only 1 GPU.
    if torch.distributed.get_world_size(group=group) == 1:
        return input_

    # All-reduce.
    torch.distributed.all_reduce(input_, group=group)

    return input_


def _split(input_):
    """Split the tensor along its last dimension and keep the
    corresponding slice."""
    group = get_model_parallel_group()

    # Bypass the function if we are using only 1 GPU.
    if torch.distributed.get_world_size(group=group) == 1:
        return input_

    # Split along last dimension.
    world_size = torch.distributed.get_world_size(group=group)
    input_list = split_tensor_along_last_dim(input_, world_size)

    # Note: torch.split does not create contiguous tensors by default.
    rank = torch.distributed.get_rank(group=group)
    output = input_list[rank].contiguous()

    return output


def _gather(input_):
    """Gather tensors and concatenate along the last dimension."""
    group = get_model_parallel_group()

    # Bypass the function if we are using only 1 GPU.
    if torch.distributed.get_world_size(group=group) == 1:
        return input_

    # Size and dimension.
    last_dim = input_.dim() - 1
    rank = torch.distributed.get_rank(group=group)
    world_size = torch.distributed.get_world_size(group=group)

    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    tensor_list[rank] = input_
    torch.distributed.all_gather(tensor_list, input_, group=group)

    # Note: torch.cat already creates a contiguous tensor.
    output = torch.cat(tensor_list, dim=last_dim).contiguous()

    return output


class _CopyToModelParallelRegion(torch.autograd.Function):
    """Pass the input to the model parallel region."""

    @staticmethod
    def forward(ctx, input_):
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        return _reduce(grad_output)


class _ReduceFromModelParallelRegion(torch.autograd.Function):
    """All-reduce the input from the model parallel region."""

    @staticmethod
    def forward(ctx, input_):
        return _reduce(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output


class _ScatterToModelParallelRegion(torch.autograd.Function):
    """Split the input and keep only the chunk corresponding to the rank."""

    @staticmethod
    def forward(ctx, input_):
        return _split(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return _gather(grad_output)


class _GatherFromModelParallelRegion(torch.autograd.Function):
    """Gather the input from the model parallel region and concatenate."""

    @staticmethod
    def forward(ctx, input_):
        return _gather(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return _split(grad_output)


# -----------------
# Helper functions.
# -----------------


def copy_to_model_parallel_region(input_):
    return _CopyToModelParallelRegion.apply(input_)


def reduce_from_model_parallel_region(input_):
    return _ReduceFromModelParallelRegion.apply(input_)


def scatter_to_model_parallel_region(input_):
    return _ScatterToModelParallelRegion.apply(input_)


def gather_from_model_parallel_region(input_):
    return _GatherFromModelParallelRegion.apply(input_)
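The four autograd Functions above come in conjugate pairs: copy/reduce and scatter/gather each use one operation as the other's backward, which is what keeps gradients consistent across the group. A single-process sketch (not part of the file) of why _split and _gather in particular are inverses:

import torch

# Splitting along the last dimension and re-concatenating the chunks in
# rank order recovers the original tensor, mirroring _split/_gather above.
x = torch.arange(24.).reshape(2, 12)
world_size = 4
chunks = torch.split(x, x.size(-1) // world_size, dim=-1)  # _split, one per rank
assert torch.equal(torch.cat(chunks, dim=-1), x)           # _gather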
404
modelscope/models/nlp/txl_poem/gpt2/mpu/random.py
Executable file
@@ -0,0 +1,404 @@
# Modified by Samyam Rajbhandari
# Used to partition the activations stored for backward propagation
# Therefore reduces the memory consumption

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import contextlib

import torch
import torch.distributed as dist
from torch import _C
from torch.cuda import _lazy_call
from torch.cuda import device as device_ctx_manager

from .initialize import (get_data_parallel_rank, get_model_parallel_group,
                         get_model_parallel_rank,
                         get_model_parallel_world_size)

PARTITION_ACTIVATIONS = False
PA_CORRECTNESS_TEST = False


def see_memory_usage(message, force=False):
    if not force:
        return
    dist.barrier()
    if dist.get_rank() == 0:
        print(message)
        print('Memory Allocated ',
              torch.cuda.memory_allocated() / (1024 * 1024 * 1024),
              'GigaBytes')
        print('Max Memory Allocated ',
              torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),
              'GigaBytes')
        print('Cache Allocated ',
              torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes')
        print('Max cache Allocated ',
              torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
              'GigaBytes')
        print(' ')


mp_rank = None  # get_model_parallel_rank()
mp_size = None  # get_model_parallel_world_size()
mp_group = None  # get_model_parallel_group()

# Default name for the model parallel rng tracker.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
transport_stream = None
cuda_device = None


def detach_variable(inputs, device=None):
    if isinstance(inputs, tuple):
        out = []
        for inp in inputs:
            if not isinstance(inp, torch.Tensor):
                out.append(inp)
                continue

            requires_grad = inp.requires_grad

            if device is not None:
                x = inp.to(device=device)
            else:
                x = inp

            x = x.detach()
            x.requires_grad = requires_grad
            out.append(x)
        return tuple(out)
    else:
        raise RuntimeError(
            'Only tuple of tensors is supported. Got Unsupported input type: ',
            type(inputs).__name__)


def _set_cuda_rng_state(new_state, device=-1):
    """Sets the random number generator state of the current GPU.

    Arguments:
        new_state (torch.ByteTensor): The desired state
    This function is adapted from the PyTorch repo (torch.cuda.set_rng_state)
    with a single change: the input state is not cloned. Cloning caused
    major performance issues for 4+ GPU cases.
    """
    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
        # older PyTorch
        def cb():
            with device_ctx_manager(device):
                _C._cuda_setRNGState(new_state)
    else:
        # newer PyTorch
        if device == -1:
            device = torch.device('cuda')
        elif isinstance(device, str):
            device = torch.device(device)
        elif isinstance(device, int):
            device = torch.device('cuda', device)

        def cb():
            idx = device.index
            if idx is None:
                idx = torch.cuda.current_device()
            default_generator = torch.cuda.default_generators[idx]
            default_generator.set_state(new_state)

    _lazy_call(cb)


class CudaRNGStatesTracker:
    """Tracker for the cuda RNG states.

    Using the `add` method, a cuda rng state is initialized based on
    the input `seed` and is assigned to `name`. Later, by forking the
    rng state, we can perform operations and return to our starting
    cuda state.
    """

    def __init__(self):
        # Map from a string name to the cuda rng state.
        self.states_ = {}
        # Seeds are just for book keeping and ensure no seed is set twice.
        self.seeds_ = set()

    def reset(self):
        """Set to the initial state (no tracker)."""
        self.states_ = {}
        self.seeds_ = set()

    def get_states(self):
        """Get rng states. Copy the dictionary so we have direct
        pointers to the states, not just a pointer to the dictionary."""
        states = {}
        for name in self.states_:
            states[name] = self.states_[name]
        return states

    def set_states(self, states):
        """Set the rng states. For efficiency purposes, we do not check
        the size of seed for compatibility."""
        self.states_ = states

    def add(self, name, seed):
        """Track the rng state."""
        # Check seed is not already used.
        if seed in self.seeds_:
            raise Exception('seed {} already exists'.format(seed))
        self.seeds_.add(seed)
        # Check that state is not already defined.
        if name in self.states_:
            raise Exception('cuda rng state {} already exists'.format(name))
        # Get the current rng state.
        orig_rng_state = torch.cuda.get_rng_state()
        # Set the new state and store it.
        torch.cuda.manual_seed(seed)
        self.states_[name] = torch.cuda.get_rng_state()
        # Reset rng state to what it was.
        _set_cuda_rng_state(orig_rng_state)

    @contextlib.contextmanager
    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
        """Fork the cuda rng state, perform operations, and exit with
        the original state."""
        # Check if we have added the state
        if name not in self.states_:
            raise Exception('cuda rng state {} is not added'.format(name))
        # Store current rng state.
        orig_cuda_rng_state = torch.cuda.get_rng_state()
        # Set rng state to the desired one
        _set_cuda_rng_state(self.states_[name])
        # Do the stuff we wanted to do.
        try:
            yield
        finally:
            # Update the current rng state for later use.
            self.states_[name] = torch.cuda.get_rng_state()
            # And set the state to the original state we started with.
            _set_cuda_rng_state(orig_cuda_rng_state)


# RNG tracker object.
_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()


def get_cuda_rng_tracker():
    """Get cuda rng tracker."""
    return _CUDA_RNG_STATE_TRACKER


def model_parallel_cuda_manual_seed(seed):
    """Initialize model parallel cuda seed.

    This function should be called after the model parallel is
    initialized. Also, no torch.cuda.manual_seed should be called
    after this function. Basically, this is a replacement for that
    function.
    Two sets of RNG states are tracked:
        default state: This is for data parallelism and is the same among a
                       set of model parallel GPUs but different across
                       different model parallel groups. This is used for
                       example for dropout in the non-model-parallel regions.
        model-parallel state: This state is different among a set of model
                              parallel GPUs, but the same across data parallel
                              groups. This is used for example for dropout in
                              model parallel regions.
    """
    # 2718 is just for fun and any POSITIVE value will work.
    offset = seed + 2718
    model_parallel_seed = offset + get_model_parallel_rank()
    # Data parallel gets the original seed.
    data_parallel_seed = seed

    if torch.distributed.get_rank() == 0:
        print(
            '> initializing model parallel cuda seeds on global rank {}, '
            'model parallel rank {}, and data parallel rank {} with '
            'model parallel seed: {} and data parallel seed: {}'.format(
                torch.distributed.get_rank(), get_model_parallel_rank(),
                get_data_parallel_rank(), model_parallel_seed,
                data_parallel_seed),
            flush=True)
    _CUDA_RNG_STATE_TRACKER.reset()
    # Set the default state.
    torch.cuda.manual_seed(data_parallel_seed)
    # and model parallel state.
    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
                                model_parallel_seed)


def get_partition_start(item):
    global mp_rank, mp_size, mp_group
    partition_size = get_partition_size(item)
    start = partition_size * mp_rank
    return int(start)


def get_partition_size(item):
    global mp_rank, mp_size, mp_group
    size = item.numel()
    partition_size = size / mp_size
    return int(partition_size)


def get_full_inputs(tensors):
    inputs = []
    for i in range(int(len(tensors) / 2) - 1):
        item = tensors[2 * i]
        size = tensors[2 * i + 1]
        partition_size = item.numel()
        tensor_size = partition_size * mp_size
        flat_tensor = torch.zeros([tensor_size],
                                  dtype=item.dtype,
                                  device=item.device)
        partitions = []
        for i in range(mp_size):
            part_i = flat_tensor.narrow(0, partition_size * i, partition_size)
            if i == mp_rank:
                part_i.copy_(item)
            partitions.append(part_i)
        dist.all_gather(partitions, partitions[mp_rank], group=mp_group)
        input_tensor = flat_tensor.view(list(size.numpy()))
        item.data = input_tensor.data

        inputs.append(item)
    inputs.append(tensors[-2])

    return tuple(inputs)


class CheckpointFunction(torch.autograd.Function):
    """This function is adapted from torch.utils.checkpoint with
    two main changes:
    1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
    2) the states in the model parallel tracker are also properly
       tracked/set/reset.
    """

    @staticmethod
    def forward(ctx, run_function, *args):
        ctx.run_function = run_function
        global mp_rank, mp_size, mp_group
        if mp_rank is None:
            mp_rank = get_model_parallel_rank()
            mp_size = get_model_parallel_world_size()
            mp_group = get_model_parallel_group()

        global cuda_device, transport_stream, PARTITION_ACTIVATIONS
        if cuda_device is None:
            if dist.get_rank() == 0:
                print(
                    f'Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}'
                )

            cuda_device = torch.cuda.current_device()
            # The transport stream is used to overlap the allgather communication for the activations
            # with the computation in the backward pass
            transport_stream = torch.cuda.Stream(device=cuda_device)

        if PARTITION_ACTIVATIONS:
            inputs = [
                item.detach().contiguous().view(-1).narrow(
                    0, get_partition_start(item),
                    get_partition_size(item)).clone() for item in args[:-1]
            ]
            inputs.append(args[-1])

        # just in case something funky is happening such as reuse of inputs
        inputs_cuda = [item.to(cuda_device) for item in args]

        # Copy the rng states.
        ctx.fwd_cpu_rng_state = torch.get_rng_state()
        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()

        with torch.no_grad():
            outputs = run_function(*inputs_cuda)

        del inputs_cuda

        if PARTITION_ACTIVATIONS:
            new_args = []
            for arg, inp in zip(args, inputs):
                size = torch.tensor(arg.size())
                arg.data = inp.data
                new_args.append(arg)
                new_args.append(size)
            ctx.save_for_backward(*new_args)
        else:
            ctx.save_for_backward(*args)

        return outputs

    @staticmethod
    def backward(ctx, *args):
        if not torch.autograd._is_checkpoint_valid():
            raise RuntimeError('Checkpointing is not compatible with .grad(), '
                               'please use .backward() if possible')

        global cuda_device, transport_stream, PARTITION_ACTIVATIONS

        if PARTITION_ACTIVATIONS:
            with torch.cuda.stream(transport_stream):
                inputs = get_full_inputs(ctx.saved_tensors)
                detached_inputs = detach_variable(inputs)
        else:
            inputs = ctx.saved_tensors
            detached_inputs = detach_variable(inputs)

        # Store the current states.
        bwd_cpu_rng_state = torch.get_rng_state()
        bwd_cuda_rng_state = torch.cuda.get_rng_state()
        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()

        # Set the states to what they were before the forward pass.
        torch.set_rng_state(ctx.fwd_cpu_rng_state)
        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)

        if PARTITION_ACTIVATIONS:
            current_stream = torch.cuda.current_stream()
            current_stream.wait_stream(transport_stream)

        with torch.enable_grad():
            outputs = ctx.run_function(*detached_inputs)

        # Set the states back to what they were at the start of this function.
        torch.set_rng_state(bwd_cpu_rng_state)
        _set_cuda_rng_state(bwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)

        if isinstance(outputs, torch.Tensor):
            outputs = (outputs, )
        torch.autograd.backward(outputs, args)
        return (None, ) + tuple(inp.grad for inp in detached_inputs)


def checkpoint(function, *args):
    """Checkpoint a model or part of the model.
    This has been directly copied from torch.utils.checkpoint."""
    return CheckpointFunction.apply(function, *args)


def partition_activations_in_checkpoint(partition_activation):
    global PARTITION_ACTIVATIONS
    PARTITION_ACTIVATIONS = partition_activation
    if dist.get_rank() == 0:
        print(
            f'**************Partition Activations {PARTITION_ACTIVATIONS}************'
        )
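The fork() semantics above are easier to see without a GPU. A CPU-only sketch (not part of the file) using torch's CPU RNG state in place of the CUDA one: draws made inside the fork come from a tracked, named stream and leave the outer stream untouched.

import contextlib

import torch


@contextlib.contextmanager
def fork_cpu_rng(state_store, name):
    # Mirror CudaRNGStatesTracker.fork(): swap in the named state, and on
    # exit remember where that stream left off and restore the outer state.
    orig = torch.get_rng_state()
    torch.set_rng_state(state_store[name])
    try:
        yield
    finally:
        state_store[name] = torch.get_rng_state()
        torch.set_rng_state(orig)


store = {}
torch.manual_seed(1234)
store['model-parallel-rng'] = torch.get_rng_state()  # like add(name, seed)

torch.manual_seed(0)
expected = torch.rand(1)      # what the outer stream would produce next
torch.manual_seed(0)          # rewind the outer stream
with fork_cpu_rng(store, 'model-parallel-rng'):
    _ = torch.rand(3)         # consumed from the tracked stream only
assert torch.equal(torch.rand(1), expected)  # outer stream unaffected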
0
modelscope/models/nlp/txl_poem/gpt2/mpu/tests/__init__.py
Executable file
86
modelscope/models/nlp/txl_poem/gpt2/mpu/tests/commons.py
Executable file
@@ -0,0 +1,86 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import random

import gpt2.mpu as mpu
import numpy
import torch


class IdentityLayer(torch.nn.Module):

    def __init__(self, size, scale=1.0):
        super(IdentityLayer, self).__init__()
        self.weight = torch.nn.Parameter(scale * torch.randn(size))

    def forward(self):
        return self.weight


def set_random_seed(seed):
    """Set random seed for reproducibility."""
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)


def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed."""
    # Get local rank in case it is provided.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--local_rank',
        type=int,
        default=None,
        help='local rank passed from distributed launcher')
    args = parser.parse_args()
    local_rank = args.local_rank

    # Get rank and world size.
    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv('WORLD_SIZE', '1'))

    print('> initializing torch.distributed with local rank: {}, '
          'rank: {}, world size: {}'.format(local_rank, rank, world_size))

    # Set the device id.
    device = rank % torch.cuda.device_count()
    if local_rank is not None:
        device = local_rank
    torch.cuda.set_device(device)

    # Call the init process.
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=init_method)


def print_separator(message):
    torch.distributed.barrier()
    filler_len = (78 - len(message)) // 2
    filler = '-' * filler_len
    string = '\n' + filler + ' {} '.format(message) + filler
    if torch.distributed.get_rank() == 0:
        print(string, flush=True)
    torch.distributed.barrier()
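A hedged usage note (not part of the file): initialize_distributed() above reads RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT from the environment, as set by the usual torch.distributed launchers, so a one-process smoke run could be prepared like this. It still needs a CUDA device, since the helper calls torch.cuda.set_device.

import os

# Environment for a single-process run; values mirror the helper's defaults.
os.environ.setdefault('RANK', '0')
os.environ.setdefault('WORLD_SIZE', '1')
os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '6000')

# from commons import initialize_distributed
# initialize_distributed(backend='nccl')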
106
modelscope/models/nlp/txl_poem/gpt2/mpu/tests/test_cross_entropy.py
Executable file
@@ -0,0 +1,106 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
import sys

import gpt2.mpu as mpu
import torch
import torch.nn.functional as F
from commons import (IdentityLayer, initialize_distributed, print_separator,
                     set_random_seed)
from mpu.cross_entropy import vocab_parallel_cross_entropy

sys.path.append('../..')


def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale,
                        seed):
    set_random_seed(seed)
    identity = IdentityLayer((batch_size, seq_length, vocab_size),
                             scale=logits_scale).cuda()
    logits = identity()
    target = torch.cuda.LongTensor(size=(batch_size,
                                         seq_length)).random_(0, vocab_size)
    loss = F.cross_entropy(
        logits.view(-1,
                    logits.size()[-1]), target.view(-1),
        reduction='none').view_as(target).mean()
    loss.backward()
    return loss, identity.weight.grad


def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed):
    set_random_seed(seed)
    identity = IdentityLayer((batch_size, seq_length, vocab_size),
                             scale=logits_scale).cuda()
    logits = identity()
    logits_parallel = mpu.scatter_to_model_parallel_region(logits)
    target = torch.cuda.LongTensor(size=(batch_size,
                                         seq_length)).random_(0, vocab_size)
    loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
    loss.backward()
    return loss, identity.weight.grad


def test_cross_entropy(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing cross entropy with model parallel size {} ...'.format(
            model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    batch_size = 13
    seq_length = 17
    vocab_size_per_partition = 11
    logits_scale = 1000.0
    vocab_size = vocab_size_per_partition * model_parallel_size
    seed = 1234

    loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
                                                 vocab_size, logits_scale,
                                                 seed)
    loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, vocab_size,
                                           logits_scale, seed)

    error = loss_torch.sub_(loss_mpu).abs().max()
    print('   max error in loss on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = grad_torch.sub_(grad_mpu).abs().max()
    print('   max error in grad on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test cross entropy')
        test_cross_entropy(model_parallel_size)
        model_parallel_size *= 2
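The quantity being tested above can be reproduced without any distributed setup. A single-process sketch (not part of the file) of the vocab-parallel cross-entropy math: each partition holds a slice of the vocabulary logits, and the global log-softmax denominator is assembled from per-partition maxima and partial sums.

import torch

# Cross entropy from two vocab partitions vs. the monolithic reference.
torch.manual_seed(0)
logits = torch.randn(5, 22)                # 5 tokens, vocab of 22
parts = logits.chunk(2, dim=-1)            # two vocab partitions of 11

global_max = torch.max(torch.stack([p.max(dim=-1).values for p in parts]),
                       dim=0).values
sum_exp = sum((p - global_max.unsqueeze(-1)).exp().sum(dim=-1) for p in parts)
log_z = global_max + sum_exp.log()         # logsumexp over the full vocab

target = torch.randint(0, 22, (5,))
loss = (log_z - logits.gather(-1, target.unsqueeze(-1)).squeeze(-1)).mean()

ref = torch.nn.functional.cross_entropy(logits, target)
assert torch.allclose(loss, ref, atol=1e-6)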
91
modelscope/models/nlp/txl_poem/gpt2/mpu/tests/test_data.py
Executable file
@@ -0,0 +1,91 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import operator
import sys

import gpt2.mpu as mpu
import torch
from commons import initialize_distributed, print_separator
from mpu import data as data_utils

sys.path.append('../..')


def test_broadcast_data(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print(
            '> testing broadcast_data with model parallel size {} ...'.format(
                model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    model_parallel_size = mpu.get_model_parallel_world_size()

    key_size_t = {
        'key1': [7, 11],
        'key2': [8, 2, 1],
        'key3': [13],
        'key4': [5, 1, 2],
        'key5': [5, 12]
    }
    keys = list(key_size_t.keys())

    data = {}
    data_t = {}
    for key in key_size_t:
        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
        data_t[key] = data[key].clone()
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    if mpu.get_model_parallel_rank() != 0:
        data = None

    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, \
        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
    for key in keys:
        assert key_size[key] == key_size_t[key]
    total_numel_t = 0
    for key in keys:
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test broadcast data')
        test_broadcast_data(model_parallel_size)
        model_parallel_size *= 2
95
modelscope/models/nlp/txl_poem/gpt2/mpu/tests/test_initialize.py
Executable file
@@ -0,0 +1,95 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

import gpt2.mpu as mpu
import torch
from commons import initialize_distributed, print_separator

sys.path.append('../..')


def test_initialize_model_parallel(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing initialize_model_parallel with size {} ...'.format(
            model_parallel_size))
    model_parallel_size_ = min(model_parallel_size,
                               torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size_)
    assert mpu.model_parallel_is_initialized()

    # Checks.
    def check(group, world_size, rank):
        assert world_size == torch.distributed.get_world_size(group=group)
        assert rank == torch.distributed.get_rank(group=group)

    # Model parallel.
    world_size = model_parallel_size_
    rank = torch.distributed.get_rank() % model_parallel_size_
    assert world_size == mpu.get_model_parallel_world_size()
    assert rank == mpu.get_model_parallel_rank()
    check(mpu.get_model_parallel_group(), world_size, rank)

    # Data parallel.
    world_size = torch.distributed.get_world_size() // model_parallel_size_
    rank = torch.distributed.get_rank() // model_parallel_size_
    assert world_size == mpu.get_data_parallel_world_size()
    assert rank == mpu.get_data_parallel_rank()
    check(mpu.get_data_parallel_group(), world_size, rank)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


def test_get_model_parallel_src_rank(model_parallel_size_):

    if torch.distributed.get_rank() == 0:
        print('> testing get_model_parallel_src_rank with size {} ...'.format(
            model_parallel_size_))
    model_parallel_size = min(model_parallel_size_,
                              torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(model_parallel_size)
    assert mpu.model_parallel_is_initialized()

    # Checks
    src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank()
    assert mpu.get_model_parallel_src_rank() == src_rank

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test initialize model parallel')
        test_initialize_model_parallel(model_parallel_size)
        print_separator('test model parallel source rank')
        test_get_model_parallel_src_rank(model_parallel_size)
        model_parallel_size *= 2
modelscope/models/nlp/txl_poem/gpt2/mpu/tests/test_layers.py
Executable file
533
modelscope/models/nlp/txl_poem/gpt2/mpu/tests/test_layers.py
Executable file
@@ -0,0 +1,533 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import sys
|
||||
|
||||
import gpt2.mpu as mpu
|
||||
import torch
|
||||
import torch.nn.init as init
|
||||
from commons import initialize_distributed, print_separator, set_random_seed
|
||||
from mpu import layers
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
sys.path.append('../..')
|
||||
|
||||
|
||||
def test_parallel_embedding(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing parallel embedding with model parallel size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
batch_size = 17
|
||||
seq_length = 23
|
||||
vocab_size = 48
|
||||
hidden_size = 16
|
||||
seed = 1236
|
||||
|
||||
set_random_seed(123)
|
||||
input_data = torch.LongTensor(size=(batch_size, seq_length)).random_(
|
||||
0, vocab_size).cuda()
|
||||
loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
|
||||
|
||||
output = embedding_original(input_data)
|
||||
loss_original = torch.mul(output, loss_weight).sum()
|
||||
loss_original.backward()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_parallel = layers.ParallelEmbedding(
|
||||
vocab_size, hidden_size, init_method=init.normal_).cuda()
|
||||
output = embedding_parallel(input_data)
|
||||
loss_parallel = torch.mul(output, loss_weight).sum()
|
||||
loss_parallel.backward()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_vocab_parallel = layers.VocabParallelEmbedding(
|
||||
vocab_size, hidden_size, init_method=init.normal_).cuda()
|
||||
output = embedding_vocab_parallel(input_data)
|
||||
loss_vocab_parallel = torch.mul(output, loss_weight).sum()
|
||||
loss_vocab_parallel.backward()
|
||||
|
||||
torch.distributed.barrier()
|
||||
error = loss_parallel.sub(loss_original).abs()
|
||||
print(' error in loss (parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
torch.distributed.barrier()
|
||||
error = loss_vocab_parallel.sub(loss_original).abs()
|
||||
print(' error in loss (vocab parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
weight_grad_orig = torch.split(embedding_original.weight.grad,
|
||||
hidden_size // model_parallel_size,
|
||||
1)[mpu.get_model_parallel_rank()]
|
||||
error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
|
||||
print(' error in grad (parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
weight_grad_orig = torch.split(embedding_original.weight.grad,
|
||||
vocab_size // model_parallel_size,
|
||||
0)[mpu.get_model_parallel_rank()]
|
||||
error = embedding_vocab_parallel.weight.grad.sub(
|
||||
weight_grad_orig).abs().max()
|
||||
print(' error in grad (vocab parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_initialize_affine_weight(model_parallel_size):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing initialize_affine_weight with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
input_size_coeff = 13
|
||||
input_size = input_size_coeff * model_parallel_size
|
||||
output_size_coeff = 17
|
||||
output_size = output_size_coeff * model_parallel_size
|
||||
|
||||
# ---------------
|
||||
# Column parallel
|
||||
# ---------------
|
||||
weight = torch.empty(output_size_coeff, input_size)
|
||||
set_random_seed(seed)
|
||||
layers._initialize_affine_weight(weight, output_size, input_size,
|
||||
output_size_coeff, 0,
|
||||
torch.nn.init.normal_)
|
||||
# Target.
|
||||
set_random_seed(seed)
|
||||
master_weight = torch.empty(output_size, input_size)
|
||||
torch.nn.init.normal_(master_weight)
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_weight = torch.split(
|
||||
master_weight, output_size_coeff, dim=0)[rank].contiguous().clone()
|
||||
|
||||
# Compare.
|
||||
error = weight.sub(my_weight).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' column parallel max error (should be zero) on global rank '
|
||||
'{}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# ------------
|
||||
# Row parallel
|
||||
# ------------
|
||||
weight = torch.empty(output_size, input_size_coeff)
|
||||
set_random_seed(seed)
|
||||
mpu.layers._initialize_affine_weight(weight, output_size, input_size,
|
||||
input_size_coeff, 1,
|
||||
torch.nn.init.normal_)
|
||||
# Target.
|
||||
set_random_seed(seed)
|
||||
master_weight = torch.empty(output_size, input_size)
|
||||
torch.nn.init.normal_(master_weight)
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_weight = torch.split(
|
||||
master_weight, input_size_coeff, dim=1)[rank].contiguous().clone()
|
||||
|
||||
# Compare.
|
||||
error = weight.sub(my_weight).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' row parallel max error (should be zero) on global rank '
|
||||
'{}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
class IdentityLayer2D(torch.nn.Module):

    def __init__(self, m, n):
        super(IdentityLayer2D, self).__init__()
        self.weight = Parameter(torch.Tensor(m, n))
        torch.nn.init.xavier_normal_(self.weight)

    def forward(self):
        return self.weight


def test_column_parallel_linear(model_parallel_size):

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing ColumnParallelLinear with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.ColumnParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_model_parallel_rank()
    my_dLdA = torch.split(
        dLdA, output_size_coeff, dim=0)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    my_dLdb = torch.split(
        dLdb, output_size_coeff, dim=0)[rank].contiguous().clone()
    error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')

def test_row_parallel_linear(model_parallel_size):

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing RowParallelLinear with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.RowParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_model_parallel_rank()
    my_dLdA = torch.split(
        dLdA, input_size_coeff, dim=1)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')

class IdentityLayer3D(torch.nn.Module):

    def __init__(self, m, n, k):
        super(IdentityLayer3D, self).__init__()
        self.weight = Parameter(torch.Tensor(m, n, k))
        torch.nn.init.xavier_normal_(self.weight)

    def forward(self):
        return self.weight


def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
                            hidden_size_per_att_head, dropout_prob, batch_size,
                            sequence_length):
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()  # noqa
    hidden_size = hidden_size_per_att_head * num_att_heads

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
                                                    dropout_prob).cuda()
    loss_weight = torch.randn([batch_size, sequence_length,
                               hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = attention_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        attention_layer, identity_layer


def test_parallel_self_attention(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing ParallelSelfAttention with model parallel '
              'size: {}'.format(model_parallel_size))

    num_att_heads_per_partition = 3
    hidden_size_per_att_head = 7
    dropout_prob = 0.0  # has to be zero
    batch_size = 5
    sequence_length = 13

    rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
        attention_layer_1, identity_layer_1 = parallel_self_attention(
            1, num_att_heads_per_partition,
            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)  # noqa

    rank, hidden_size, model_parallel_size, loss, \
        attention_layer, identity_layer = parallel_self_attention(
            model_parallel_size, num_att_heads_per_partition,
            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)  # noqa
    assert hidden_size_1 == hidden_size

    error = loss_1.sub(loss).abs().max()
    torch.distributed.barrier()
    print(' loss error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    my_lin_grad_list = torch.split(
        attention_layer_1.query_key_value.weight.grad,
        hidden_size // model_parallel_size, 0)[rank::model_parallel_size]
    my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
    error = my_lin_grad.sub(
        attention_layer.query_key_value.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' weight gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    error = identity_layer_1.weight.grad.sub(
        identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' input gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')

def parallel_transformer(model_parallel_size, num_att_heads_per_partition,
                         hidden_size_per_att_head, batch_size,
                         sequence_length):

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()  # noqa
    hidden_size = hidden_size_per_att_head * num_att_heads
    intermediate_size = 4 * hidden_size

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    transformer_layer = mpu.BertParallelTransformerLayer(
        hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
        torch.nn.functional.relu, 1.0e-5).cuda()

    loss_weight = torch.randn([batch_size, sequence_length,
                               hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = transformer_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        transformer_layer, identity_layer


def test_parallel_transformer_layer(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing ParallelTransformerLayer with model parallel '
              'size: {}'.format(model_parallel_size))

    num_att_heads_per_partition = 3
    hidden_size_per_att_head = 7
    batch_size = 5
    sequence_length = 13

    rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
        transformer_layer_1, identity_layer_1 = parallel_transformer(
            1, num_att_heads_per_partition,
            hidden_size_per_att_head, batch_size, sequence_length)

    rank, hidden_size, model_parallel_size, loss, \
        transformer_layer, identity_layer = parallel_transformer(
            model_parallel_size, num_att_heads_per_partition,
            hidden_size_per_att_head, batch_size, sequence_length)

    error = loss_1.sub(loss).abs().max()
    torch.distributed.barrier()
    print(' loss error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-5, 'error: {}'.format(error)

    error = identity_layer_1.weight.grad.sub(
        identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' input gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-5, 'error: {}'.format(error)

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


if __name__ == '__main__':

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    print_separator('test initialize affine weight')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_initialize_affine_weight(model_parallel_size)
        model_parallel_size *= 2

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test parallel embedding')
        test_parallel_embedding(model_parallel_size)
        model_parallel_size *= 2

    print_separator('test column-parallel linear')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_column_parallel_linear(model_parallel_size)
        model_parallel_size *= 2

    print_separator('test row-parallel linear')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_row_parallel_linear(model_parallel_size)
        model_parallel_size *= 2

    print_separator('test parallel self-attention')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_parallel_self_attention(model_parallel_size)
        model_parallel_size *= 2

    print_separator('test parallel transformer')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_parallel_transformer_layer(model_parallel_size)
        model_parallel_size *= 2
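
# Illustrative note (assumption, not part of the original file): these tests
# need an initialized torch.distributed process group, so they are meant to be
# started through a distributed launcher rather than run directly, e.g.
# (hypothetical GPU count):
#
#     torchrun --nproc_per_node=2 test_layers.py
#
# Each while-loop above then sweeps model_parallel_size over powers of two
# (1, 2, 4, ...) up to the launched world size.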
206
modelscope/models/nlp/txl_poem/gpt2/mpu/tests/test_random.py
Executable file
@@ -0,0 +1,206 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

# The path tweak has to happen before the package imports below can resolve.
sys.path.append('../..')

import torch  # noqa: E402
import gpt2.mpu as mpu  # noqa: E402
from commons import initialize_distributed, print_separator  # noqa: E402

def test_set_cuda_rng_state(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing set_rng_state with size {} ...'.format(
            model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    size = 123
    seed = 1234  # noqa
    torch.cuda.manual_seed(1234)
    tensor = torch.cuda.FloatTensor(size)

    # Get the state
    rng_state = torch.cuda.get_rng_state()
    rng_state_copy = rng_state.clone()

    # Do some stuff.
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_1 = tensor.clone()

    assert rng_state.sub(rng_state_copy).max() == 0
    assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0

    # State should be different.
    new_rng_state = torch.cuda.get_rng_state()
    max_diff = new_rng_state.sub(rng_state).max()
    print(
        ' max diff in rng state (should be non-zero) on global rank {}: {}'.
        format(torch.distributed.get_rank(), max_diff))
    assert max_diff > 0

    # Reset the rng state and do the same stuff.
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_2 = tensor.clone()

    # Results should be the same
    error = result_2.sub(result_1).abs().max()
    print(' max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Input state should have remained intact.
    error = rng_state.sub(rng_state_copy).max()
    print(' max error in rng state (should be zero) on global rank {}: {}'.
          format(torch.distributed.get_rank(), error))
    assert error == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')

def test_cuda_rng_tracker(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing cuda rng tracker with size {} ...'.format(
            model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed_1 = 1234
    seed_2 = 4321
    size = [12, 21]
    tensor = torch.cuda.FloatTensor(size)

    # Set to seed_1 and generate two tensors.
    torch.cuda.manual_seed(seed_1)
    torch.randn(size, out=tensor)
    target_11 = tensor.clone()
    torch.randn(size, out=tensor)
    target_12 = tensor.clone()

    # Set to seed_2 and generate two tensors.
    torch.cuda.manual_seed(seed_2)
    torch.randn(size, out=tensor)
    target_21 = tensor.clone()
    torch.randn(size, out=tensor)
    target_22 = tensor.clone()

    # Now if we interleave seed_1 and seed_2,
    # we should still get the same tensors
    torch.cuda.manual_seed(seed_1)
    mpu.get_cuda_rng_tracker().add('test', seed_2)

    torch.randn(size, out=tensor)
    result_11 = tensor.clone()

    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_21 = tensor.clone()

    torch.randn(size, out=tensor)
    result_12 = tensor.clone()

    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_22 = tensor.clone()

    diff = result_11.sub(result_21).abs().max()
    diff = min(diff, result_12.sub(result_22).abs().max())
    print(' max diff in generated tensors (should be non-zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
    assert diff > 1.0e-6
    error = max(
        result_11.sub(target_11).abs().max(),
        result_12.sub(target_12).abs().max())
    error = max(error, result_21.sub(target_21).abs().max())
    error = max(error, result_22.sub(target_22).abs().max())
    print(' max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')

def test_model_parallel_cuda_manual_seed(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing model parallel cuda manual seed with size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    mpu.model_parallel_cuda_manual_seed(12345)
    assert torch.cuda.initial_seed() == 12345
    with mpu.get_cuda_rng_tracker().fork():
        assert torch.cuda.initial_seed() == (12345 + 2718
                                             + mpu.get_model_parallel_rank())

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')

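
# Descriptive sketch of the seed layout checked above (inferred from the
# assertions, not part of the original file): every rank shares the default
# generator seed, while the tracked model-parallel generator is offset by a
# constant (2718) plus the model-parallel rank, so dropout inside
# model-parallel regions differs per partition while data-parallel randomness
# stays in sync.
def _expected_model_parallel_seed(seed, mp_rank, offset=2718):
    return seed + offset + mp_rank
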
if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test set rng state')
        test_set_cuda_rng_state(model_parallel_size)
        model_parallel_size *= 2

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test cuda rng tracker')
        test_cuda_rng_tracker(model_parallel_size)
        model_parallel_size *= 2

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test model parallel cuda manual seed')
        test_model_parallel_cuda_manual_seed(model_parallel_size)
        model_parallel_size *= 2
886
modelscope/models/nlp/txl_poem/gpt2/mpu/transformer.py
Executable file
@@ -0,0 +1,886 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer."""

import math

import deepspeed
import torch
import torch.nn.init as init
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm

from .initialize import get_model_parallel_world_size
from .layers import ColumnParallelLinear, RowParallelLinear
from .mappings import gather_from_model_parallel_region
from .random import checkpoint, get_cuda_rng_tracker
from .utils import divide, split_tensor_along_last_dim

class PositionalEmbedding(torch.nn.Module):

    def __init__(self, hidden_size):
        super(PositionalEmbedding, self).__init__()

        self.hidden_size = hidden_size

        inv_freq = 1 / (
            10000**(torch.arange(0.0, hidden_size, 2.0) / hidden_size))  # noqa
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, pos_seq, bsz=None):
        sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)

        if bsz is not None:
            return pos_emb[None, :, :].expand(bsz, -1, -1)
        else:
            return pos_emb[None, :, :]

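
# Minimal usage sketch (illustrative only, not part of the original file):
# for hidden_size=8 and a descending position sequence of length 5, the
# embedding is [1, 5, 8], sine terms in the first half of the last dimension
# and cosine terms in the second half.
def _positional_embedding_example():
    pos_emb_layer = PositionalEmbedding(8)
    pos_seq = torch.arange(4, -1, -1.0)  # [4., 3., 2., 1., 0.]
    pos_emb = pos_emb_layer(pos_seq)
    assert pos_emb.shape == (1, 5, 8)
    return pos_emb
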
class GPT2ParallelSelfAttention(torch.nn.Module):
    """Parallel self-attention layer for GPT2.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence length, and h is the hidden size
    and creates output of the same size.
    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by the number of
                             GPUs used to parallelize the model. Also, we
                             require the hidden size to be divisible by n.
        dropout_prob: dropout probability for the attention scores.
        init_method: weight initialization.
        output_layer_init_method: output layer initialization. If None, use
                                  `init_method`.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 init_method,
                 output_layer_init_method=None,
                 relative_encoding=False):
        super(GPT2ParallelSelfAttention, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Per attention head and per partition values.
        world_size = get_model_parallel_world_size()
        self.hidden_size_per_partition = divide(hidden_size, world_size)
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)
        self.num_attention_heads_per_partition = divide(
            num_attention_heads, world_size)
        self.relative_encoding = relative_encoding
        # Strided linear layer.
        self.query_key_value = ColumnParallelLinear(
            hidden_size,
            3 * hidden_size,
            stride=3,
            gather_output=False,
            init_method=init_method)
        if relative_encoding:
            self.relative = ColumnParallelLinear(
                hidden_size,
                self.hidden_size_per_partition,
                gather_output=False,
                init_method=init_method)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)

        # Output.
        self.dense = RowParallelLinear(
            hidden_size,
            hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)
        self.output_dropout = torch.nn.Dropout(output_dropout_prob)

        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """
        new_tensor_shape = tensor.size()[:-1] + \
            (self.num_attention_heads_per_partition,  # noqa
             self.hidden_size_per_attention_head)  # noqa
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)

    @staticmethod
    def _rel_shift(x, zero_triu=False):
        # ql x kl x bsz x h
        # bsz x h x ql x kl
        zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1),
                               device=x.device,
                               dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2))

        x = x_padded[:, :, 1:].view_as(x)

        if zero_triu:
            ones = torch.ones((x.size(0), x.size(1)))
            x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None]

        return x

    @staticmethod
    def _rel_shift_latest(x: torch.Tensor):
        ndims = x.dim()
        x_shape = x.size()
        row_dim = 2
        col_dim = row_dim + 1
        assert col_dim < ndims
        tgt_shape_1, tgt_shape_2 = [], []
        for i in range(ndims):
            if i == row_dim:
                tgt_shape_1.append(x_shape[col_dim])
                tgt_shape_2.append(x_shape[row_dim])
            elif i == col_dim:
                tgt_shape_1.append(x_shape[row_dim])
                tgt_shape_2.append(x_shape[col_dim] - 1)
            else:
                tgt_shape_1.append(x_shape[i])
                tgt_shape_2.append(x_shape[i])
        x = x.view(*tgt_shape_1)
        x = x[:, :, 1:, :]
        x = x.view(*tgt_shape_2)
        return x

    def forward(self,
                hidden_states,
                ltor_mask,
                position_embeddings=None,
                r_w_bias=None,
                r_r_bias=None,
                mem=None):
        # hidden_states: [b, s, h]
        # ltor_mask: [1, 1, s, s]

        # Attention heads. [b, s, hp]
        query_length = hidden_states.size(1)

        if mem is None:
            mixed_x_layer = self.query_key_value(hidden_states)
            (mixed_query_layer, mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(
                 mixed_x_layer, 3)
        else:
            cat = torch.cat((mem, hidden_states), 1)
            mixed_x_layer = self.query_key_value(cat)
            (mixed_query_layer, mixed_key_layer,
             mixed_value_layer) = split_tensor_along_last_dim(
                 mixed_x_layer, 3)
            mixed_query_layer = mixed_query_layer[:, -query_length:]

        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        if self.relative_encoding:
            relative_layer = self.relative(position_embeddings)
            relative_layer = self._transpose_for_scores(
                relative_layer)  # 1 (bsz) x n_head x klen x d_head
            # Raw attention scores. [b, np, qs, ks]
            rw_head_q = query_layer + r_w_bias.unsqueeze(1)
            ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2))
            rr_head_q = query_layer + r_r_bias.unsqueeze(1)
            bd_score = torch.matmul(rr_head_q,
                                    relative_layer.transpose(-1, -2))
            bd_score = self._rel_shift(bd_score)  # qlen x klen x bsz x n_head
            # bd_score = bd_score.permute(2, 3, 0, 1)  # bsz n_head qlen klen

            attention_scores = ac_score + bd_score
        else:
            # Raw attention scores. [b, np, s, s]
            attention_scores = torch.matmul(query_layer,
                                            key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(
            self.hidden_size_per_attention_head)
        # Apply the left to right attention mask.
        attention_scores = torch.mul(attention_scores, ltor_mask) - \
            10000.0 * (1.0 - ltor_mask)  # noqa

        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        with get_cuda_rng_tracker().fork():
            attention_probs = self.attention_dropout(attention_probs)

        # Context layer.
        # [b, np, s, hn]
        context_layer = torch.matmul(attention_probs, value_layer)
        # [b, s, np, hn]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + \
            (self.hidden_size_per_partition,)  # noqa
        # [b, s, hp]
        context_layer = context_layer.view(*new_context_layer_shape)

        # Output. [b, s, h]
        output = self.dense(context_layer)
        output = self.output_dropout(output)

        return output

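
# A concrete check of the _rel_shift trick above (an illustrative sketch, not
# part of the original file): with one batch, one head and qlen = klen = 3,
# entry [i, j] of the output equals entry [i, j + (klen - 1 - i)] of the
# input, which is how scores indexed by relative distance get realigned to
# absolute key positions (entries shifted past the edge are padding garbage
# and are masked downstream).
def _rel_shift_example():
    x = torch.arange(9.0).view(1, 1, 3, 3)
    shifted = GPT2ParallelSelfAttention._rel_shift(x)
    assert shifted[0, 0, 2].tolist() == [6.0, 7.0, 8.0]  # last row unshifted
    assert shifted[0, 0, 1, :2].tolist() == [4.0, 5.0]  # row 1 moved left by 1
    return shifted
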
@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (
        1.0 + torch.tanh(0.7978845608028654 * x *  # noqa
                         (1.0 + 0.044715 * x * x)))  # noqa


def gelu(x):
    return gelu_impl(x)

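
# Quick sanity sketch (illustrative only, not part of the original file): the
# tanh approximation above stays within about 1e-3 of the exact erf-based
# GELU over a typical input range.
def _gelu_example():
    x = torch.linspace(-3.0, 3.0, steps=7)
    assert (gelu(x) - torch.nn.functional.gelu(x)).abs().max() < 1e-3
    return gelu(x)
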
class GPT2ParallelMLP(torch.nn.Module):
    """MLP for GPT2.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform gelu transformation, and project the
    state back into h hidden dimension. At the end, dropout is also
    applied.

    Arguments:
        hidden_size: The hidden size of the self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weights are initialized to one.
        output_layer_init_method: output layer initialization. If None,
                                  use `init_method`.
    """

    def __init__(self,
                 hidden_size,
                 output_dropout_prob,
                 init_method,
                 output_layer_init_method=None):
        super(GPT2ParallelMLP, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method
        # Project to 4h.
        self.dense_h_to_4h = ColumnParallelLinear(
            hidden_size,
            4 * hidden_size,
            gather_output=False,
            init_method=init_method)
        # Project back to h.
        self.dense_4h_to_h = RowParallelLinear(
            4 * hidden_size,
            hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)
        self.dropout = torch.nn.Dropout(output_dropout_prob)

    def forward(self, hidden_states):
        # [b, s, 4hp]
        intermediate_parallel = self.dense_h_to_4h(hidden_states)
        intermediate_parallel = gelu(intermediate_parallel)

        # [b, s, h]
        output = self.dense_4h_to_h(intermediate_parallel)
        output = self.dropout(output)
        return output

class GPT2ParallelTransformerLayer(torch.nn.Module):
    """A single layer transformer for GPT2.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    The transformer layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention heads in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weights are initialized to one.
        output_layer_init_method: output layers (attention output and
                                  mlp output) initialization. If None,
                                  use `init_method`.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 layernorm_epsilon,
                 init_method,
                 output_layer_init_method=None,
                 relative_encoding=False):
        super(GPT2ParallelTransformerLayer, self).__init__()
        # Set output layer initialization if not provided.
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        # Layernorm on the input data.
        self.input_layernorm = torch.nn.LayerNorm(
            hidden_size, eps=layernorm_epsilon)

        # Self attention.
        self.attention = GPT2ParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method,
            relative_encoding=relative_encoding)

        # Layernorm on the attention output.
        self.post_attention_layernorm = torch.nn.LayerNorm(
            hidden_size, eps=layernorm_epsilon)

        # MLP
        self.mlp = GPT2ParallelMLP(
            hidden_size,
            output_dropout_prob,
            init_method,
            output_layer_init_method=output_layer_init_method)

    def forward(self,
                hidden_states,
                ltor_mask,
                position_embeddings=None,
                r_w_bias=None,
                r_r_bias=None,
                mem=None):
        # hidden_states: [b, s, h]
        # ltor_mask: [1, 1, s, s]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        mem = self.input_layernorm(mem) if mem is not None else None
        # Self attention.
        attention_output = self.attention(layernorm_output, ltor_mask,
                                          position_embeddings, r_w_bias,
                                          r_r_bias, mem)
        # Residual connection.
        layernorm_input = hidden_states + attention_output
        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)
        # MLP.
        mlp_output = self.mlp(layernorm_output)
        # Second residual connection.
        output = layernorm_input + mlp_output

        return output

def unscaled_init_method(sigma):
    """Init method based on N(0, sigma)."""

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def scaled_init_method(sigma, num_layers):
    """Init method based on N(0, sigma / sqrt(2 * num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_

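
# Illustrative comparison (not part of the original file): with sigma=0.02 and
# 24 layers, scaled_init_method draws from N(0, 0.02 / sqrt(48)), i.e. a std
# of roughly 0.0029, so residual output projections start smaller as depth
# grows and the residual stream's variance stays roughly constant.
def _init_method_example():
    w = torch.empty(4, 4)
    unscaled_init_method(0.02)(w)  # std = 0.02
    scaled_init_method(0.02, 24)(w)  # std = 0.02 / sqrt(2 * 24) ~= 0.0029
    return w
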
class GPT2ParallelTransformer(torch.nn.Module):
    """GPT-2 transformer.

    This module takes input from the embedding layer and its output can
    be used directly by a logit layer. It consists of L (num-layers)
    blocks of:
        layer norm
        self attention
        residual connection
        layer norm
        mlp
        residual connection
    followed by a final layer norm.

    Arguments:
        num_layers: Number of transformer layers.
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention heads in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        checkpoint_activations: if True, checkpoint activations.
        checkpoint_num_layers: number of layers to checkpoint. This
                               is basically the chunk size in checkpointing.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method_std: standard deviation of the init method which has
                         the form N(0, std).
        use_scaled_init_for_output_weights: If True, use 1/sqrt(2*num_layers)
                                            scaling for the output weights (
                                            output of self attention and mlp).
    """

    def __init__(self,
                 num_layers,
                 hidden_size,
                 num_attention_heads,
                 max_sequence_length,
                 max_memory_length,
                 embedding_dropout_prob,
                 attention_dropout_prob,
                 output_dropout_prob,
                 checkpoint_activations,
                 checkpoint_num_layers=1,
                 layernorm_epsilon=1.0e-5,
                 init_method_std=0.02,
                 use_scaled_init_for_output_weights=True,
                 relative_encoding=False):
        super(GPT2ParallelTransformer, self).__init__()
        # Store activation checkpointing flag.
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers
        self.max_memory_length = max_memory_length

        output_layer_init_method = None
        if use_scaled_init_for_output_weights:
            output_layer_init_method = scaled_init_method(
                init_method_std, num_layers)
        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
        self.relative_encoding = relative_encoding
        if relative_encoding:
            # Relative position embedding
            self.position_embeddings = PositionalEmbedding(hidden_size)
            # Per attention head and per partition values.
            world_size = get_model_parallel_world_size()
            self.hidden_size_per_attention_head = divide(
                hidden_size, num_attention_heads)
            self.num_attention_heads_per_partition = divide(
                num_attention_heads, world_size)
            self.r_w_bias = torch.nn.Parameter(
                torch.Tensor(self.num_attention_heads_per_partition,
                             self.hidden_size_per_attention_head))
            self.r_w_bias.model_parallel = True
            self.r_r_bias = torch.nn.Parameter(
                torch.Tensor(self.num_attention_heads_per_partition,
                             self.hidden_size_per_attention_head))
            self.r_r_bias.model_parallel = True
            # Always initialize bias to zero.
            with torch.no_grad():
                self.r_w_bias.zero_()
                self.r_r_bias.zero_()
        else:
            # Position embedding (serial).
            self.position_embeddings = torch.nn.Embedding(
                max_sequence_length, hidden_size)
            # Initialize the position embeddings.
            torch.nn.init.normal_(
                self.position_embeddings.weight, mean=0.0, std=init_method_std)

        def get_layer():
            return GPT2ParallelTransformerLayer(
                hidden_size,
                num_attention_heads,
                attention_dropout_prob,
                output_dropout_prob,
                layernorm_epsilon,
                unscaled_init_method(init_method_std),
                output_layer_init_method=output_layer_init_method,
                relative_encoding=relative_encoding)

        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [get_layer() for _ in range(num_layers)])

        # Final layer norm before output.
        self.final_layernorm = torch.nn.LayerNorm(
            hidden_size, eps=layernorm_epsilon)

        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def forward(self, hidden_states, position_ids, attention_mask, *mems):
        batch_size, query_length = hidden_states.size()[:2]
        memory_length = mems[0].size(1) if mems else 0
        key_length = query_length + memory_length
        attention_mask = attention_mask[:, :, :,
                                        -query_length - memory_length:]
        if self.relative_encoding:
            position_sequence = torch.arange(
                key_length - 1,
                -1,
                -1.0,
                device=hidden_states.device,
                dtype=hidden_states.dtype)
            position_embeddings = self.position_embeddings(position_sequence)
            # Apply dropout (once each to the position embeddings and the
            # hidden states).
            position_embeddings = self.embedding_dropout(position_embeddings)
            hidden_states = self.embedding_dropout(hidden_states)
        else:
            position_embeddings = self.position_embeddings(position_ids)
            hidden_states = hidden_states + position_embeddings
            hidden_states = self.embedding_dropout(hidden_states)
        if self.max_memory_length > 0:
            mem_layers = [hidden_states.detach()]
        else:
            mem_layers = []

        def custom(start, end):

            def custom_forward(*inputs):
                layers_ = self.layers[start:end]
                x_, inputs = inputs[0], inputs[1:]
                if self.relative_encoding:
                    inputs, mems_ = inputs[:4], inputs[4:]
                else:
                    inputs, mems_ = inputs[:1], inputs[1:]
                for i, layer in enumerate(layers_):
                    mem_i_ = mems_[i] if mems_ else None
                    x_ = layer(x_, *inputs, mem=mem_i_)
                    if self.max_memory_length > 0:
                        mem_layers.append(x_.detach())
                return x_

            return custom_forward

        if self.checkpoint_activations:
            l = 0  # noqa
            num_layers = len(self.layers)
            chunk_length = self.checkpoint_num_layers
            while l < num_layers:
                args = [hidden_states, attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                if mems:
                    args += mems[l:l + chunk_length]
                hidden_states = checkpoint(custom(l, l + chunk_length),
                                           *args)  # noqa
                l += chunk_length  # noqa
        else:
            for i, layer in enumerate(self.layers):
                args = [hidden_states, attention_mask]
                if self.relative_encoding:
                    args += [position_embeddings, self.r_w_bias, self.r_r_bias]
                mem_i = mems[i] if mems else None
                hidden_states = layer(*args, mem=mem_i)
                if self.max_memory_length > 0:
                    mem_layers.append(hidden_states.detach())

        # Final layer norm.
        output = self.final_layernorm(hidden_states)
        if self.max_memory_length > 0:
            mem_layers = self.update_mems(mem_layers, mems)

        return (output, *mem_layers)

    def update_mems(self, hiddens, mems):
        memory_length = mems[0].size(1) if mems else 0
        query_length = hiddens[0].size(1)
        new_memory_length = min(self.max_memory_length,
                                memory_length + query_length)
        new_mems = []
        with torch.no_grad():
            for i in range(len(hiddens)):
                if new_memory_length <= query_length:
                    new_mems.append(hiddens[i][:, -new_memory_length:])
                else:
                    new_mems.append(
                        torch.cat(
                            (mems[i][:, -new_memory_length + query_length:],
                             hiddens[i]),
                            dim=1))
        return new_mems

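
# A shape-level sketch of update_mems (illustrative only; _MemHolder is a
# hypothetical stand-in so the unbound method can run without building the
# full transformer): with max_memory_length=4, an old memory of length 3 and
# a query of length 2, the new memory keeps the last 2 old positions plus the
# 2 new hidden states, i.e. the most recent 4 positions.
def _update_mems_example():
    class _MemHolder:
        max_memory_length = 4

    mems = [torch.zeros(1, 3, 8)]
    hiddens = [torch.ones(1, 2, 8)]
    new_mems = GPT2ParallelTransformer.update_mems(_MemHolder(), hiddens, mems)
    assert new_mems[0].shape == (1, 4, 8)
    return new_mems
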
class BertParallelSelfAttention(torch.nn.Module):
    """Parallel self-attention layer for BERT.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence length, and h is the hidden size
    and creates output of the same size.
    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by the number of
                             GPUs used to parallelize the model. Also, we
                             require the hidden size to be divisible by n.
        dropout_prob: dropout probability for the attention scores.
        output_parallel: If true, no all-gather is done on the output and
                         the output values will be per partition.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 dropout_prob,
                 output_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelSelfAttention, self).__init__()
        # Input configuration.
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.dropout_prob = dropout_prob
        self.output_parallel = output_parallel
        # Per attention head and per partition values.
        world_size = get_model_parallel_world_size()
        self.hidden_size_per_partition = divide(hidden_size, world_size)
        self.hidden_size_per_attention_head = divide(hidden_size,
                                                     num_attention_heads)
        self.num_attention_heads_per_partition = divide(
            num_attention_heads, world_size)
        # Strided linear layer.
        self.query_key_value = ColumnParallelLinear(
            hidden_size,
            3 * hidden_size,
            stride=3,
            gather_output=False,
            init_method=init_method)
        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.dropout = torch.nn.Dropout(dropout_prob)

        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """  # noqa
        new_tensor_shape = tensor.size()[:-1] + (
            self.num_attention_heads_per_partition,
            self.hidden_size_per_attention_head)
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):

        # Attention heads. [b, s, hp]
        mixed_x_layer = self.query_key_value(hidden_states)
        (mixed_query_layer, mixed_key_layer,
         mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)

        # Reshape and transpose [b, np, s, hn]
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)

        # Raw attention scores. [b, np, s, s]
        norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head))
        attention_scores = torch.matmul(
            query_layer / norm_factor,
            key_layer.transpose(-1, -2) / norm_factor)
        # Apply the attention mask.
        attention_scores += attention_mask

        # Attention probabilities. [b, np, s, s]
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        with get_cuda_rng_tracker().fork():
            attention_probs = self.dropout(attention_probs)

        # Context layer.
        # [b, np, s, hn]
        context_layer = torch.matmul(attention_probs, value_layer)
        # [b, s, np, hn]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.hidden_size_per_partition, )  # noqa
        # [b, s, hp]
        context_layer = context_layer.view(*new_context_layer_shape)

        # Output. [b, s, h]
        if self.output_parallel:
            output = context_layer
        else:
            output = gather_from_model_parallel_region(context_layer)

        return output

class BertParallelTransformerOutput(torch.nn.Module):
    """The output layer used after self attention and intermediate
    parts of transformer layer."""

    def __init__(self,
                 input_size,
                 output_size,
                 dropout_prob,
                 layernorm_epsilon=1.0e-12,
                 input_is_parallel=False,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerOutput, self).__init__()
        # Components.
        self.dense = RowParallelLinear(
            input_size,
            output_size,
            input_is_parallel=input_is_parallel,
            init_method=init_method)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.layernorm = torch.nn.LayerNorm(output_size, eps=layernorm_epsilon)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        layernorm_input = hidden_states + input_tensor
        hidden_states = self.layernorm(layernorm_input)
        return hidden_states

class BertParallelTransformerLayer(torch.nn.Module):
    """A single layer transformer for BERT.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    The transformer layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        intermediate_size: size of the intermediate state after
                           self attention. In both BERT and GPT
                           this is set to be 4 times the hidden
                           size.
        num_attention_heads: number of attention heads in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        intermediate_activation_fn: activation function for output
                                    of intermediate.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weights are initialized to one.
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_dropout_prob,
                 output_dropout_prob,
                 intermediate_activation_fn,
                 layernorm_epsilon,
                 init_method=init.xavier_normal_):
        super(BertParallelTransformerLayer, self).__init__()

        # Self attention.
        self.attention = BertParallelSelfAttention(
            hidden_size,
            num_attention_heads,
            attention_dropout_prob,
            output_parallel=True,
            init_method=init_method)
        # Self attention output.
        self.self_output = BertParallelTransformerOutput(
            hidden_size,
            hidden_size,
            output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method)
        # Intermediate.
        self.intermediate = ColumnParallelLinear(
            hidden_size,
            intermediate_size,
            gather_output=False,
            init_method=init_method)
        self.intermediate_activation_fn = intermediate_activation_fn
        # Output.
        self.output = BertParallelTransformerOutput(
            intermediate_size,
            hidden_size,
            output_dropout_prob,
            layernorm_epsilon=layernorm_epsilon,
            input_is_parallel=True,
            init_method=init_method)

    def forward(self, hidden_states, attention_mask):
        # [b, s, hp]
        attention_output_parallel = self.attention(hidden_states,
                                                   attention_mask)
        # [b, s, h]
        attention_self_output = self.self_output(attention_output_parallel,
                                                 hidden_states)
        # [b, s, ip]
        intermediate_output_parallel = self.intermediate(attention_self_output)
        intermediate_output_parallel = self.intermediate_activation_fn(
            intermediate_output_parallel)
        # [b, s, h]
        layer_output = self.output(intermediate_output_parallel,
                                   attention_self_output)

        return layer_output
70
modelscope/models/nlp/txl_poem/gpt2/mpu/utils.py
Executable file
@@ -0,0 +1,70 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Ensure that numerator is divisible by the denominator and return
    the division value."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator

def split_tensor_along_last_dim(tensor,
                                num_partitions,
                                contiguous_split_chunks=False):
    """Split a tensor along its last dimension.
    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor into.
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list

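
# Usage sketch (illustrative only, not part of the original file): splitting a
# [2, 6] tensor into 3 partitions yields three [2, 2] views; pass
# contiguous_split_chunks=True when the chunks will feed kernels that require
# contiguous memory.
def _split_example():
    t = torch.arange(12).view(2, 6)
    chunks = split_tensor_along_last_dim(t, 3, contiguous_split_chunks=True)
    assert len(chunks) == 3 and chunks[0].shape == (2, 2)
    return chunks
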
class VocabUtility:
    """Split the vocabulary into `world_size` chunks and return the
    first and last index of the vocabulary belonging to the `rank`
    partition. Note that indices are in [first, last)."""

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        index_f = rank * per_partition_vocab_size
        index_l = index_f + per_partition_vocab_size
        return index_f, index_l

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank,
                                           world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)
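
# Usage sketch (illustrative only, not part of the original file): a
# 50304-token vocabulary split across 4 model-parallel ranks gives rank 1 the
# half-open index range [12576, 25152).
def _vocab_range_example():
    start, end = VocabUtility.vocab_range_from_global_vocab_size(50304, 1, 4)
    assert (start, end) == (12576, 25152)
    return start, end
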
440
modelscope/models/nlp/txl_poem/gpt2/utils.py
Executable file
@@ -0,0 +1,440 @@
# Modified by Zhipu.AI
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for logging and serialization"""

import os
import random
import time

import numpy as np
import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP

from . import model, mpu
from .fp16 import FP16_Optimizer

SUMMARY_WRITER_DIR_NAME = 'runs'

def print_rank_0(message):
    if torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            print(message, flush=True)
    else:
        print(message, flush=True)


def print_args(args):
    """Print arguments."""

    print('arguments:', flush=True)
    for arg in vars(args):
        dots = '.' * (29 - len(arg))
        print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)


def print_params_min_max_norm(optimizer, iteration):
    """Print min, max, and norm of all parameters."""
    index = 0
    rank = torch.distributed.get_rank()
    string = 'iteration, rank, index, model-parallel, min, max, norm\n'
    optimizer_ = optimizer
    if isinstance(optimizer, FP16_Optimizer):
        optimizer_ = optimizer.optimizer
    for param_group in optimizer_.param_groups:
        for param in param_group['params']:
            index += 1
            min_ = param.data.min()
            max_ = param.data.max()
            norm = param.data.norm()
            string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format(
                iteration, rank, index, int(param.model_parallel))
            string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm)
    print(string, flush=True)

class Timers:
    """Group of timers."""

    class Timer:
        """Timer."""

        def __init__(self, name):
            self.name_ = name
            self.elapsed_ = 0.0
            self.started_ = False
            self.start_time = time.time()

        def start(self):
            """Start the timer."""
            assert not self.started_, 'timer has already been started'
            torch.cuda.synchronize()
            self.start_time = time.time()
            self.started_ = True

        def stop(self):
            """Stop the timer."""
            assert self.started_, 'timer is not started'
            torch.cuda.synchronize()
            self.elapsed_ += (time.time() - self.start_time)
            self.started_ = False

        def reset(self):
            """Reset timer."""
            self.elapsed_ = 0.0
            self.started_ = False

        def elapsed(self, reset=True):
            """Calculate the elapsed time."""
            started_ = self.started_
            # If timing is in progress, stop it first.
            if self.started_:
                self.stop()
            # Get the elapsed time.
            elapsed_ = self.elapsed_
            # Reset the elapsed time.
            if reset:
                self.reset()
            # If timing was in progress, restart it.
            if started_:
                self.start()
            return elapsed_

    def __init__(self):
        self.timers = {}

    def __call__(self, name):
        if name not in self.timers:
            self.timers[name] = self.Timer(name)
        return self.timers[name]

    def log(self, names, normalizer=1.0, reset=True):
        """Log a group of timers."""
        assert normalizer > 0.0
        string = 'time (ms)'
        for name in names:
            elapsed_time = self.timers[name].elapsed(
                reset=reset) * 1000.0 / normalizer
            string += ' | {}: {:.2f}'.format(name, elapsed_time)
        print_rank_0(string)

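A usage sketch (illustrative; `run_forward_pass` is a placeholder, and a CUDA device is assumed because Timer.start/stop call torch.cuda.synchronize()):

timers = Timers()
timers('forward').start()
output = run_forward_pass()  # placeholder for the region being timed
timers('forward').stop()
timers.log(['forward'])      # prints: time (ms) | forward: <elapsed>
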
def report_memory(name):
    """Simple GPU memory report."""

    mega_bytes = 1024.0 * 1024.0
    string = name + ' memory (MB)'
    string += ' | allocated: {}'.format(torch.cuda.memory_allocated()
                                        / mega_bytes)
    string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated()
                                            / mega_bytes)
    string += ' | cached: {}'.format(torch.cuda.memory_reserved()
                                     / mega_bytes)
    string += ' | max cached: {}'.format(torch.cuda.max_memory_reserved()
                                         / mega_bytes)
    print_rank_0(string)

def get_checkpoint_name(checkpoints_path,
                        iteration,
                        release=False,
                        zero=False):
    if release:
        d = 'release'
    else:
        d = 'iter_{:07d}'.format(iteration)
    if zero:
        dp_rank = mpu.get_data_parallel_rank()
        d += '_zero_dp_rank_{}'.format(dp_rank)
    return os.path.join(checkpoints_path, d,
                        'mp_rank_{:02d}'.format(mpu.get_model_parallel_rank()),
                        'model_optim_rng.pt')


def ensure_directory_exists(filename):
    dirname = os.path.dirname(filename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)


def get_checkpoint_tracker_filename(checkpoints_path):
    return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt')

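For orientation, the resulting on-disk layout (paths are illustrative):

# get_checkpoint_name('/ckpt', 1000) on model-parallel rank 0 resolves to
#     /ckpt/iter_0001000/mp_rank_00/model_optim_rng.pt
# and the tracker file recording the latest saved iteration lives at
#     /ckpt/latest_checkpointed_iteration.txt
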
def save_zero_checkpoint(args, iteration, optimizer):
    zero_sd = {
        'iteration': iteration,
        'optimizer_state_dict': optimizer.state_dict()
    }
    zero_checkpoint_name = get_checkpoint_name(args.save, iteration, zero=True)
    ensure_directory_exists(zero_checkpoint_name)
    torch.save(zero_sd, zero_checkpoint_name)
    print(' successfully saved {}'.format(zero_checkpoint_name))

def save_checkpoint(iteration, model, optimizer, lr_scheduler, args):  # noqa
    """Save a model checkpoint."""
    if args.deepspeed:
        save_ds_checkpoint(iteration, model, lr_scheduler, args)
    else:
        # Only rank zero of the data parallel group writes to disk.
        if isinstance(model, torchDDP):
            model = model.module

        if mpu.get_data_parallel_rank() == 0:
            checkpoint_name = get_checkpoint_name(args.save, iteration)
            print(
                'global rank {} is saving checkpoint at iteration {:7d} to {}'.
                format(torch.distributed.get_rank(), iteration,
                       checkpoint_name))

            sd = {}
            sd['iteration'] = iteration
            sd['model'] = model.state_dict()

            # Optimizer stuff.
            if not args.no_save_optim:
                if optimizer is not None:
                    sd['optimizer'] = optimizer.state_dict()
                if lr_scheduler is not None:
                    sd['lr_scheduler'] = lr_scheduler.state_dict()

            # rng states.
            if not args.no_save_rng:
                sd['random_rng_state'] = random.getstate()
                sd['np_rng_state'] = np.random.get_state()
                sd['torch_rng_state'] = torch.get_rng_state()
                sd['cuda_rng_state'] = torch.cuda.get_rng_state()
                sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker(
                ).get_states()

            ensure_directory_exists(checkpoint_name)
            torch.save(sd, checkpoint_name)
            print(' successfully saved {}'.format(checkpoint_name))

    # Wait so everyone is done (necessary).
    torch.distributed.barrier()
    # And update the latest iteration.
    if torch.distributed.get_rank() == 0:
        tracker_filename = get_checkpoint_tracker_filename(args.save)
        with open(tracker_filename, 'w') as f:
            f.write(str(iteration))
    # Wait so everyone is done (not strictly necessary).
    torch.distributed.barrier()

def save_ds_checkpoint(iteration, model, lr_scheduler, args):  # noqa
    """Save a model checkpoint."""

    sd = {}
    sd['iteration'] = iteration
    if lr_scheduler is not None:
        sd['client_lr_scheduler'] = lr_scheduler.state_dict()
    # rng states.
    if not args.no_save_rng:
        sd['random_rng_state'] = random.getstate()
        sd['np_rng_state'] = np.random.get_state()
        sd['torch_rng_state'] = torch.get_rng_state()
        sd['cuda_rng_state'] = torch.cuda.get_rng_state()
        sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()

    model.save_checkpoint(args.save, str(iteration), client_state=sd)

def get_checkpoint_iteration(args):
    # Read the tracker file and set the iteration.
    tracker_filename = get_checkpoint_tracker_filename(args.load)
    if not os.path.isfile(tracker_filename):
        print_rank_0('WARNING: could not find the metadata file {} '.format(
            tracker_filename))
        print_rank_0('    will not load any checkpoints and will start from '
                     'random')
        return 0, False, False
    iteration = 0
    release = False
    with open(tracker_filename, 'r') as f:
        metastring = f.read().strip()
        try:
            iteration = int(metastring)
        except ValueError:
            release = metastring == 'release'
            if not release:
                print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
                    tracker_filename))
                exit()

    assert iteration > 0 or release, 'error parsing metadata file {}'.format(
        tracker_filename)

    return iteration, release, True

def load_checkpoint(
        model,  # noqa
        optimizer,
        lr_scheduler,
        args,
        load_optimizer_states=True):
    """Load a model checkpoint."""

    iteration, release, success = get_checkpoint_iteration(args)

    if not success:
        return 0

    if args.deepspeed:

        checkpoint_name, sd = model.load_checkpoint(
            args.load, iteration, load_optimizer_states=not args.no_load_optim)
        if 'client_lr_scheduler' in sd:
            lr_scheduler.load_state_dict(sd['client_lr_scheduler'])
            print_rank_0('Load lr scheduler state')
        if checkpoint_name is None:
            if mpu.get_data_parallel_rank() == 0:
                print('Unable to load checkpoint.')
            return iteration

    else:

        # Checkpoint.
        checkpoint_name = get_checkpoint_name(args.load, iteration, release)

        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        sd = torch.load(checkpoint_name, map_location='cpu')

        if isinstance(model, torchDDP):
            model = model.module

        # Model.
        try:
            model.load_state_dict(sd['model'])
        except KeyError:
            print_rank_0('A metadata file exists but unable to load model '
                         'from checkpoint {}, exiting'.format(checkpoint_name))
            exit()

        # Optimizer.
        if not release and not args.finetune and not args.no_load_optim:
            try:
                if optimizer is not None and load_optimizer_states:
                    optimizer.load_state_dict(sd['optimizer'])
                if lr_scheduler is not None:
                    lr_scheduler.load_state_dict(sd['lr_scheduler'])
            except KeyError:
                print_rank_0(
                    'Unable to load optimizer from checkpoint {}, exiting. '
                    'Specify --no-load-optim or --finetune to prevent '
                    'attempting to load the optimizer '
                    'state.'.format(checkpoint_name))
                exit()

    # Iterations.
    if args.finetune or release:
        iteration = 0
    else:
        try:
            iteration = sd['iteration']
        except KeyError:
            try:  # Backward compatible with older checkpoints
                iteration = sd['total_iters']
            except KeyError:
                print_rank_0(
                    'A metadata file exists but unable to load iteration '
                    'from checkpoint {}, exiting'.format(checkpoint_name))
                exit()

    # rng states.
    if not release and not args.finetune and not args.no_load_rng:
        try:
            random.setstate(sd['random_rng_state'])
            np.random.set_state(sd['np_rng_state'])
            torch.set_rng_state(sd['torch_rng_state'])
            torch.cuda.set_rng_state(sd['cuda_rng_state'])
            mpu.get_cuda_rng_tracker().set_states(sd['rng_tracker_states'])
        except KeyError:
            print_rank_0(
                'Unable to load random state from checkpoint {}, exiting. '
                'Specify --no-load-rng or --finetune to prevent '
                'attempting to load the random '
                'state.'.format(checkpoint_name))
            exit()

    if mpu.get_data_parallel_rank() == 0:
        print(' successfully loaded {}'.format(checkpoint_name))

    return iteration

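Taken together, a sketch of the intended call pattern (illustrative; assumes torch.distributed and mpu are already initialized, and that `args` follows the Megatron-style arguments defined elsewhere in this diff):

iteration = load_checkpoint(model, optimizer, lr_scheduler, args)
# ... training loop resumes from `iteration` ...
save_checkpoint(iteration, model, optimizer, lr_scheduler, args)
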
def load_weights(src, dst, dst2src=False):
    """
    Loads weights from src to dst via in-place copy.
    src is a huggingface GPT-2 model, while dst is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src is still untested
    """
    conv_layer = 'Conv1D' in str(type(src))
    for n, p in src.named_parameters():
        if dst2src:
            data = dst._parameters[n].data
            load = p.data
        else:
            data = p.data
            load = dst._parameters[n].data
        if conv_layer and 'weight' in n:
            data = data.t().contiguous()
        load.copy_(data)


# dst._parameters[n].data.copy_(data)

def load_mlp(our, oai, dst2src=False):
    load_weights(oai.c_fc, our.dense_h_to_4h, dst2src)
    load_weights(oai.c_proj, our.dense_4h_to_h, dst2src)


def load_attention(our, oai, dst2src=False):
    load_weights(oai.c_attn, our.query_key_value, dst2src)
    load_weights(oai.c_proj, our.dense, dst2src)


def load_transformer_layer(our, oai, dst2src=False):
    load_weights(oai.ln_1, our.input_layernorm, dst2src)
    load_weights(oai.ln_2, our.post_attention_layernorm, dst2src)
    load_mlp(our.mlp, oai.mlp, dst2src)
    load_attention(our.attention, oai.attn, dst2src)

def move_weights(our, oai, dst2src=False):
    """
    Loads weights from `oai` to `our` via in-place copy.
    `oai` is a huggingface GPT-2 model, while `our` is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src=True is still untested
    """
    # while isinstance(our, (torchDDP, model.distributed.DistributedDataParallel, FP16_Module)):
    #     our = our.module
    transformer_model = oai.transformer
    load_weights(transformer_model.ln_f, our.transformer.final_layernorm,
                 dst2src)
    load_weights(transformer_model.wte, our.word_embeddings, dst2src)
    load_weights(transformer_model.wpe, our.position_embeddings, dst2src)

    for our_layer, oai_layer in zip(our.transformer.layers, oai.transformer.h):
        load_transformer_layer(our_layer, oai_layer, dst2src)

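A hypothetical conversion sketch (`our_model` is an assumed, pre-built mpu GPT-2 instance with matching layer count and hidden size; only the HuggingFace side is concrete):

from transformers import GPT2LMHeadModel

hf_model = GPT2LMHeadModel.from_pretrained('gpt2')
move_weights(our_model, hf_model)  # copy HF weights into our model
# move_weights(our_model, hf_model, dst2src=True) would go the other way,
# but as the docstring notes that direction is still untested.
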
33
modelscope/models/nlp/txl_poem/txl_for_fast_poem.py
Normal file
@@ -0,0 +1,33 @@
# Copyright (c) 2022 Zhipu.AI

import os
from typing import Dict

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
from .fastpoem import fast_poem, prepare_model


@MODELS.register_module(Tasks.fast_poem, module_name=Models.txl)
class TXLForFastPoem(TorchModel):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Initialize the fast poem model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)

        # initialize model
        self.model, self.tokenizer, self.args = prepare_model(model_dir)

    def forward(self, input: Dict[str, str]) -> Dict[str, str]:
        pass

    def generate(self, input: Dict[str, str]) -> Dict[str, str]:
        res = fast_poem(input, self.model, self.tokenizer, self.args)
        return {OutputKeys.TEXT: res['text']}

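A usage sketch (the directory path is a placeholder that must contain whatever prepare_model expects; the meaning of lycr/senlength is inferred, not documented in this diff):

model = TXLForFastPoem('/path/to/model_dir')
result = model.generate({
    'title': '明月',
    'author': '杜甫',
    'desc': '寂寞',
    'lycr': 7,       # presumably characters per line (5 or 7)
    'senlength': 4   # presumably number of lines
})
print(result)        # {'text': '...generated poem...'}
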
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
    from .word_segmentation_pipeline import WordSegmentationPipeline
    from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
    from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline
    from .txl_fast_poem_pipeline import TXLFastPoemPipeline
    from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \
        WordSegmentationThaiPipeline

@@ -73,6 +74,7 @@ else:
        'zero_shot_classification_pipeline':
        ['ZeroShotClassificationPipeline'],
        'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'],
        'txl_fast_poem_pipeline': ['TXLFastPoemPipeline'],
        'multilingual_word_segmentation_pipeline': [
            'MultilingualWordSegmentationPipeline',
            'WordSegmentationThaiPipeline'

59
modelscope/pipelines/nlp/txl_fast_poem_pipeline.py
Normal file
@@ -0,0 +1,59 @@
# Copyright (c) 2022 Zhipu.AI

from typing import Any, Dict, Optional, Union

from modelscope.metainfo import Pipelines
from modelscope.models.base import Model
from modelscope.models.nlp import TXLForFastPoem
from modelscope.pipelines.base import Pipeline, Tensor
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import Preprocessor, TXLFastPoemPreprocessor
from modelscope.utils.constant import Tasks

__all__ = ['TXLFastPoemPipeline']


@PIPELINES.register_module(
    group_key=Tasks.fast_poem, module_name=Pipelines.txl_fast_poem)
class TXLFastPoemPipeline(Pipeline):

    def __init__(self,
                 model: Union[TXLForFastPoem, str],
                 preprocessor: Optional[Preprocessor] = None,
                 *args,
                 **kwargs):
        model = TXLForFastPoem(model) if isinstance(model, str) else model
        self.model = model
        self.model.eval()
        if preprocessor is None:
            preprocessor = TXLFastPoemPreprocessor()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)

    # define the forward pass
    def forward(self, inputs: Union[Dict, str],
                **forward_params) -> Dict[str, Any]:
        if isinstance(inputs, str):
            inputs = {
                'title': inputs,
                'author': '李白',
                'desc': '寂寞',
                'lycr': 7,
                'senlength': 4
            }
        else:
            if 'title' not in inputs:
                inputs['title'] = '月光'
            if 'author' not in inputs:
                inputs['author'] = '李白'
            if 'desc' not in inputs:
                inputs['desc'] = '寂寞'
            if 'lycr' not in inputs:
                inputs['lycr'] = 7
            if 'senlength' not in inputs:
                inputs['senlength'] = 4

        return self.model.generate(inputs)

    # format the outputs from pipeline
    def postprocess(self, input, **kwargs) -> Dict[str, Any]:
        return input

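A usage sketch of the string shorthand (the model id matches the one used in the test below; the defaults are the ones filled in by forward() above):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipe = pipeline(task=Tasks.fast_poem, model='ZhipuAI/TransformerXL-Fast-Poem')
print(pipe('月光'))  # falls back to author='李白', desc='寂寞', lycr=7, senlength=4
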
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
        SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor,
        TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor,
        TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize,
-       WordSegmentationBlankSetToLabelPreprocessor,
+       WordSegmentationBlankSetToLabelPreprocessor, TXLFastPoemPreprocessor,
        MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor,
        TextGenerationJiebaPreprocessor, SentencePiecePreprocessor,
        DialogIntentPredictionPreprocessor, DialogModelingPreprocessor,

@@ -57,7 +57,7 @@ else:
        'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor',
        'Tokenize', 'Text2TextGenerationPreprocessor',
        'WordSegmentationBlankSetToLabelPreprocessor',
-       'MGLMSummarizationPreprocessor',
+       'MGLMSummarizationPreprocessor', 'TXLFastPoemPreprocessor',
        'ZeroShotClassificationPreprocessor',
        'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor',
        'NERPreprocessorViet', 'NERPreprocessorThai',

@@ -30,6 +30,7 @@ if TYPE_CHECKING:
    from .space_T_en import ConversationalTextToSqlPreprocessor
    from .space_T_cn import TableQuestionAnsweringPreprocessor
    from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor
    from .txl_fast_poem_preprocessor import TXLFastPoemPreprocessor
else:
    _import_structure = {
        'nlp_base': [

@@ -64,6 +65,7 @@ else:
            'TextErrorCorrectionPreprocessor',
        ],
        'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'],
        'txl_fast_poem_preprocessor': ['TXLFastPoemPreprocessor'],
        'token_classification_thai_preprocessor': [
            'NERPreprocessorThai',
            'WordSegmentationPreprocessorThai',

26
modelscope/preprocessors/nlp/txl_fast_poem_preprocessor.py
Normal file
@@ -0,0 +1,26 @@
# Copyright (c) 2022 Zhipu.AI

import re
from typing import Any, Dict, Iterable, Optional, Tuple, Union

from modelscope.metainfo import Models, Preprocessors
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import PREPROCESSORS
from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
from modelscope.utils.type_assert import type_assert


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.txl_fast_poem)
class TXLFastPoemPreprocessor(Preprocessor):

    def __init__(self, *args, **kwargs):
        """Preprocess the data. Takes no model-specific arguments;
        the input is passed through unchanged."""
        super().__init__(*args, **kwargs)

    @type_assert(object, (str, tuple, Dict))
    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
        return data

@@ -118,6 +118,7 @@ class NLPTasks(object):
    table_question_answering = 'table-question-answering'
    fill_mask = 'fill-mask'
    text_summarization = 'text-summarization'
    fast_poem = 'fast-poem'
    question_answering = 'question-answering'
    zero_shot_classification = 'zero-shot-classification'
    backbone = 'backbone'

40
tests/pipelines/test_txl_fast_poem.py
Normal file
@@ -0,0 +1,40 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest

from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.preprocessors import TXLFastPoemPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.test_utils import test_level


class TXLTest(unittest.TestCase, DemoCompatibilityCheck):

    def setUp(self) -> None:
        self.output_dir = 'unittest_output'
        os.makedirs(self.output_dir, exist_ok=True)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_TXL_with_name(self):
        model = 'ZhipuAI/TransformerXL-Fast-Poem'
        preprocessor = TXLFastPoemPreprocessor()
        pipe = pipeline(
            task=Tasks.fast_poem,
            model=model,
            preprocessor=preprocessor,
        )
        inputs = {
            'title': '明月',
            'author': '杜甫',
            'desc': '寂寞',
            'lycr': 7,
            'senlength': 4
        }
        result = pipe(inputs)
        print(result)


if __name__ == '__main__':
    unittest.main()