diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index cbab0e0b..4fd985d0 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -32,6 +32,7 @@ class Models(object):
     tcrf = 'transformer-crf'
     bart = 'bart'
    gpt3 = 'gpt3'
+    plug = 'plug'
 
     # audio models
     sambert_hifigan = 'sambert-hifigan'
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 24e65ef1..d2b05884 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -20,6 +20,7 @@ if TYPE_CHECKING:
     from .task_models.task_model import SingleBackboneTaskModelBase
     from .bart_for_text_error_correction import BartForTextErrorCorrection
     from .gpt3 import GPT3ForTextGeneration
+    from .plug import PlugForTextGeneration
 else:
     _import_structure = {
@@ -42,6 +43,7 @@ else:
         'task_model': ['SingleBackboneTaskModelBase'],
         'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
         'gpt3': ['GPT3ForTextGeneration'],
+        'plug': ['PlugForTextGeneration'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py
new file mode 100644
index 00000000..b74258a4
--- /dev/null
+++ b/modelscope/models/nlp/plug/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .configuration_plug import PlugNLGConfig
+    from .modeling_plug import PlugModel
+    from .distributed_plug import DistributedPlug
+    from .plug_for_text_generation import PlugForTextGeneration
+else:
+    _import_structure = {
+        'configuration_plug': ['PlugNLGConfig'],
+        'modeling_plug': ['PlugModel'],
+        'distributed_plug': ['DistributedPlug'],
+        'plug_for_text_generation': ['PlugForTextGeneration'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
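The package wires its submodules through `LazyImportModule`, so the heavyweight dependencies (torch, deepspeed) are only loaded when a symbol is first accessed. A minimal sketch of what this enables, assuming the `plug_for_text_generation` module added elsewhere in this PR:

    # Resolved lazily on first access rather than at package import time:
    from modelscope.models.nlp.plug import PlugForTextGeneration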
diff --git a/modelscope/models/nlp/plug/arguments.py b/modelscope/models/nlp/plug/arguments.py
new file mode 100755
index 00000000..e3a0c152
--- /dev/null
+++ b/modelscope/models/nlp/plug/arguments.py
@@ -0,0 +1,414 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""argparser configuration"""
+
+import argparse
+import os
+
+import deepspeed
+import torch
+
+
+def add_model_config_args(parser):
+    """Model arguments."""
+
+    group = parser.add_argument_group('model', 'model configuration')
+
+    group.add_argument('--pretrained-bert', action='store_true',
+                       help='use a pretrained bert-large-uncased model instead '
+                            'of initializing from scratch. See '
+                            '--tokenizer-model-type to specify which pretrained '
+                            'BERT model to use')
+    group.add_argument('--attention-dropout', type=float, default=0.1,
+                       help='dropout probability for attention weights')
+    group.add_argument('--num-attention-heads', type=int, default=16,
+                       help='num of transformer attention heads')
+    group.add_argument('--hidden-size', type=int, default=1024,
+                       help='transformer hidden size')
+    group.add_argument('--intermediate-size', type=int, default=None,
+                       help='transformer embedding dimension for FFN; '
+                            'set to 4*`--hidden-size` if it is None')
+    group.add_argument('--num-layers', type=int, default=24,
+                       help='num decoder layers')
+    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
+                       help='layer norm epsilon')
+    group.add_argument('--hidden-dropout', type=float, default=0.1,
+                       help='dropout probability for hidden state transformer')
+    group.add_argument('--max-position-embeddings', type=int, default=512,
+                       help='maximum number of position embeddings to use')
+    group.add_argument('--vocab-size', type=int, default=30522,
+                       help='vocab size to use for non-character-level '
+                            'tokenization. This value will only be used when '
+                            'creating a tokenizer')
+    group.add_argument('--deep-init', action='store_true',
+                       help='initialize bert model similar to gpt2 model. '
+                            'Scales initialization of projection layers by a '
+                            'factor of 1/sqrt(2N). Necessary to train bert '
+                            'models larger than BERT-Large.')
+    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
+                       help='Pad the vocab size to be divisible by this value. '
+                            'This is added for computational efficiency reasons.')
+    group.add_argument('--cpu-optimizer', action='store_true',
+                       help='Run optimizer on CPU')
+    group.add_argument('--cpu_torch_adam', action='store_true',
+                       help='Use Torch Adam as optimizer on CPU.')
+
+    return parser
+
+
+def add_fp16_config_args(parser):
+    """Mixed precision arguments."""
+
+    group = parser.add_argument_group('fp16', 'fp16 configurations')
+
+    group.add_argument('--fp16', action='store_true',
+                       help='Run model in fp16 mode')
+    group.add_argument('--fp32-embedding', action='store_true',
+                       help='embedding in fp32')
+    group.add_argument('--fp32-layernorm', action='store_true',
+                       help='layer norm in fp32')
+    group.add_argument('--fp32-tokentypes', action='store_true',
+                       help='embedding token types in fp32')
+    group.add_argument('--fp32-allreduce', action='store_true',
+                       help='all-reduce in fp32')
+    group.add_argument('--hysteresis', type=int, default=2,
+                       help='hysteresis for dynamic loss scaling')
+    group.add_argument('--loss-scale', type=float, default=None,
+                       help='Static loss scaling; positive power-of-2 '
+                            'values can improve fp16 convergence. If None, '
+                            'dynamic loss scaling is used.')
+    group.add_argument('--loss-scale-window', type=float, default=1000,
+                       help='Window over which to raise/lower dynamic scale')
+    group.add_argument('--min-scale', type=float, default=1,
+                       help='Minimum loss scale for dynamic loss scale')
+
+    return parser
+
+
+def add_training_args(parser):
+    """Training arguments."""
+
+    group = parser.add_argument_group('train', 'training configurations')
+
+    group.add_argument('--batch-size', type=int, default=4,
+                       help='Data Loader batch size')
+    group.add_argument('--weight-decay', type=float, default=0.01,
+                       help='weight decay coefficient for L2 regularization')
+    group.add_argument('--checkpoint-activations', action='store_true',
+                       help='checkpoint activations to allow for training '
+                            'with larger models and sequences')
+    group.add_argument('--checkpoint-num-layers', type=int, default=1,
+                       help='chunk size (number of layers) for checkpointing')
+    group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
+                       help='uses activation checkpointing from deepspeed')
+    group.add_argument('--clip-grad', type=float, default=1.0,
+                       help='gradient clipping')
+    group.add_argument('--train-iters', type=int, default=1000000,
+                       help='total number of iterations to train over all '
+                            'training runs')
+    group.add_argument('--log-interval', type=int, default=100,
+                       help='report interval')
+    group.add_argument('--exit-interval', type=int, default=None,
+                       help='Exit the program after this many new iterations.')
+    group.add_argument('--seed', type=int, default=1234,
+                       help='random seed')
+    # Batch producer arguments.
+    group.add_argument('--reset-position-ids', action='store_true',
+                       help='Reset position ids after end-of-document token.')
+    group.add_argument('--reset-attention-mask', action='store_true',
+                       help='Reset self attention mask after '
+                            'end-of-document token.')
+
+    # Learning rate.
+    group.add_argument('--lr-decay-iters', type=int, default=None,
+                       help='number of iterations to decay LR over; '
+                            'if None defaults to `--train-iters`*`--epochs`')
+    group.add_argument('--lr-decay-style', type=str, default='linear',
+                       choices=['constant', 'linear', 'cosine', 'exponential'],
+                       help='learning rate decay function')
+    group.add_argument('--lr', type=float, default=1.0e-4,
+                       help='initial learning rate')
+    group.add_argument('--warmup', type=float, default=0.01,
+                       help='percentage of data to warmup on (.01 = 1% of all '
+                            'training iters). Default 0.01')
+    group.add_argument('--batch-warmup', type=float, default=0.01,
+                       help='percentage of data to warmup on (.01 = 1% of all '
+                            'training iters). Default 0.01')
+    group.add_argument('--length-warmup', type=float, default=0.01,
+                       help='percentage of data to warmup on (.01 = 1% of all '
+                            'training iters). Default 0.01')
+    # Model checkpointing.
+    group.add_argument('--save', type=str, default=None,
+                       help='Output directory to save checkpoints to.')
+    group.add_argument('--save-interval', type=int, default=2000,
+                       help='number of iterations between saves')
+    group.add_argument('--no-save-optim', action='store_true',
+                       help='Do not save current optimizer.')
+    group.add_argument('--no-save-rng', action='store_true',
+                       help='Do not save current rng state.')
+    group.add_argument('--load', type=str, default=None,
+                       help='Path to a directory containing a model checkpoint.')
+    group.add_argument('--load-iteration', type=str, default=0,
+                       help='Load iteration of a model checkpoint.')
+    group.add_argument('--pre-load', action='store_true',
+                       help='Use pre-load instead of deepspeed load.')
+    group.add_argument('--no-load-optim', action='store_true',
+                       help='Do not load optimizer when loading checkpoint.')
+    group.add_argument('--no-load-rng', action='store_true',
+                       help='Do not load rng state when loading checkpoint.')
+    group.add_argument('--no-load-lr', action='store_true',
+                       help='Do not load lr schedule when loading checkpoint.')
+    group.add_argument('--finetune', action='store_true',
+                       help='Load model for finetuning. Do not load optimizer '
+                            'or rng state from checkpoint and set iteration to 0. '
+                            'Assumed when loading a release checkpoint.')
+    group.add_argument('--resume-dataloader', action='store_true',
+                       help='Resume the dataloader when resuming training. '
+                            'Does not apply to tfrecords dataloader; try '
+                            'resuming with a different seed in this case.')
+    # Distributed training args.
+    group.add_argument('--distributed-backend', default='nccl',
+                       help='which backend to use for distributed '
+                            'training. One of [gloo, nccl]')
+    group.add_argument('--local_rank', type=int, default=None,
+                       help='local rank passed from distributed launcher')
+
+    return parser
+
+
+def add_evaluation_args(parser):
+    """Evaluation arguments."""
+
+    group = parser.add_argument_group('validation', 'validation configurations')
+
+    group.add_argument('--eval-batch-size', type=int, default=None,
+                       help='Data Loader batch size for evaluation datasets. '
+                            'Defaults to `--batch-size`')
+    group.add_argument('--eval-iters', type=int, default=100,
+                       help='number of iterations to run for validation/test '
+                            'evaluation')
+    group.add_argument('--eval-interval', type=int, default=1000,
+                       help='interval between running evaluation on validation set')
+    group.add_argument('--eval-seq-length', type=int, default=None,
+                       help='Maximum sequence length to process for '
+                            'evaluation. Defaults to `--seq-length`')
+    group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
+                       help='Maximum number of predictions to use for '
+                            'evaluation. Defaults to '
+                            'math.ceil(`--eval-seq-length`*.15/10)*10')
+    group.add_argument('--overlapping-eval', type=int, default=32,
+                       help='sliding window for overlapping eval')
+    group.add_argument('--cloze-eval', action='store_true',
+                       help='Evaluation dataset from `--valid-data` is a cloze task')
+    group.add_argument('--eval-hf', action='store_true',
+                       help='perform evaluation with huggingface openai model. '
+                            'Use `--load` to specify weights path to be loaded')
+    group.add_argument('--load-openai', action='store_true',
+                       help='load openai weights into our model. Use `--load` '
+                            'to specify weights path to be loaded')
+
+    return parser
+
+
+def add_text_generate_args(parser):
+    """Text generation arguments."""
+
+    group = parser.add_argument_group('Text generation', 'configurations')
+    group.add_argument('--temperature', type=float, default=1.0)
+    group.add_argument('--top_p', type=float, default=0.0)
+    group.add_argument('--top_k', type=int, default=0)
+    group.add_argument('--out-seq-length', type=int, default=256)
+    return parser
+
+
+def add_struct_args(parser):
+    group = parser.add_argument_group('struct', 'struct configurations')
+    group.add_argument('--gradient-accumulation-steps', type=int, default=1,
+                       help='Not implemented yet.')
+    group.add_argument('--num-epochs', type=int, default=1,
+                       help='Not implemented yet.')
+    group.add_argument('--struct-bert-dataset', action='store_true', default=False,
+                       help='Use struct bert dataset or not.')
+    return parser
+
+
+def add_palm_args(parser):
+    group = parser.add_argument_group('palm', 'palm configurations')
+    group.add_argument('--dec-layers', type=int, default=6,
+                       help='num decoder layers')
+    group.add_argument('--tgt-length', type=int, default=100,
+                       help='maximum target sequence length')
+    group.add_argument('--vae-size', type=int, default=8192,
+                       help='vae code vocab size')
+    group.add_argument('--max-image-position', type=int, default=1025,
+                       help='max image decode position')
+    group.add_argument('--palm-dataset', action='store_true', default=False,
+                       help='Use palm dataset or not.')
+    group.add_argument('--image-dataset', action='store_true', default=False,
+                       help='Use image dataset or not.')
+    group.add_argument('--do-mask-lm', action='store_true', default=False,
+                       help='Do mask lm task or not.')
+    group.add_argument('--vae-enc-model', type=str, default=None,
+                       help='Path to a directory containing the VAE encoder '
+                            'checkpoint.')
+    return parser
+
+
+def add_downstream_args(parser):
+    group = parser.add_argument_group('downstream', 'downstream configurations')
+    group.add_argument('--downstream-dataset', action='store_true', default=False,
+                       help='Use downstream dataset or not.')
+    group.add_argument('--task-name', default='ocnli', type=str)
+    return parser
+
+
+def add_data_args(parser):
+    """Train/valid/test data arguments."""
+
+    group = parser.add_argument_group('data', 'data configurations')
+
+    group.add_argument('--model-parallel-size', type=int, default=1,
+                       help='size of the model parallel group.')
+    group.add_argument('--shuffle', action='store_true',
+                       help='Shuffle data. Shuffling is deterministic '
+                            'based on seed and current epoch.')
+    group.add_argument('--train-data', nargs='+', default=None,
+                       help='Whitespace separated filenames or corpora names '
+                            'for training.')
+    group.add_argument('--use-npy-data-loader', action='store_true',
+                       help='Use the numpy data loader. If set, then '
+                            'train-data-path, val-data-path, and test-data-path '
+                            'should also be provided.')
+    group.add_argument('--train-data-path', type=str, default='',
+                       help='path to the training data')
+    group.add_argument('--val-data-path', type=str, default='',
+                       help='path to the validation data')
+    group.add_argument('--test-data-path', type=str, default='',
+                       help='path to the test data')
+    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
+                       help='the filename containing all the shards sizes')
+    group.add_argument('--delim', default=',',
+                       help='delimiter used to parse csv data files')
+    group.add_argument('--text-key', default='sentence',
+                       help='key to use to extract text from json/csv')
+    group.add_argument('--eval-text-key', default=None,
+                       help='key to use to extract text from '
+                            'json/csv evaluation datasets')
+    group.add_argument('--valid-data', nargs='*', default=None,
+                       help='Filename for validation data.')
+    group.add_argument('--split', default='1000,1,1',
+                       help='comma-separated list of proportions for training, '
+                            'validation, and test split')
+    group.add_argument('--test-data', nargs='*', default=None,
+                       help='Filename for testing')
+    group.add_argument('--lazy-loader', action='store_true',
+                       help='whether to lazy read the data set')
+    group.add_argument('--loose-json', action='store_true',
+                       help='Use loose json (one json-formatted string per '
+                            'newline), instead of tight json (data file is one '
+                            'json string)')
+    group.add_argument('--presplit-sentences', action='store_true',
+                       help='Dataset content consists of documents where '
+                            'each document consists of newline separated sentences')
+    group.add_argument('--num-workers', type=int, default=2,
+                       help='Number of workers to use for dataloading')
+    group.add_argument('--tokenizer-model-type', type=str,
+                       default='bert-large-uncased',
+                       help="Model type to use for sentencepiece tokenization "
+                            "(one of ['bpe', 'char', 'unigram', 'word']) or "
+                            "bert vocab to use for BertWordPieceTokenizer (one of "
+                            "['bert-large-uncased', 'bert-large-cased', etc.])")
+    group.add_argument('--tokenizer-path', type=str, default='tokenizer.model',
+                       help='path used to save/load sentencepiece tokenization '
+                            'models')
+    group.add_argument('--tokenizer-type', type=str,
+                       default='BertWordPieceTokenizer',
+                       choices=['CharacterLevelTokenizer',
+                                'SentencePieceTokenizer',
+                                'BertWordPieceTokenizer',
+                                'GPT2BPETokenizer'],
+                       help='what type of tokenizer to use')
+    group.add_argument('--cache-dir', default=None, type=str,
+                       help='Where to store pre-trained BERT downloads')
+    group.add_argument('--use-tfrecords', action='store_true',
+                       help='load `--train-data`, `--valid-data`, '
+                            '`--test-data` from BERT tf records instead of '
+                            'normal data pipeline')
+    group.add_argument('--seq-length', type=int, default=512,
+                       help='Maximum sequence length to process')
+    group.add_argument('--max-preds-per-seq', type=int, default=None,
+                       help='Maximum number of predictions to use per sequence. '
+                            'Defaults to math.ceil(`--seq-length`*.15/10)*10. '
+                            'MUST BE SPECIFIED IF `--use-tfrecords` is True.')
+
+    return parser
+
+
+def get_args():
+    """Parse all the args."""
+
+    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
+    parser = add_model_config_args(parser)
+    parser = add_fp16_config_args(parser)
+    parser = add_training_args(parser)
+    parser = add_evaluation_args(parser)
+    parser = add_text_generate_args(parser)
+    parser = add_struct_args(parser)
+    parser = add_palm_args(parser)
+    parser = add_downstream_args(parser)
+    parser = add_data_args(parser)
+
+    # Include DeepSpeed configuration arguments.
+    parser = deepspeed.add_config_arguments(parser)
+
+    args = parser.parse_args()
+
+    args.deepspeed = False
+    args.cuda = torch.cuda.is_available()
+
+    args.rank = int(os.getenv('RANK', '0'))
+    args.world_size = int(os.getenv('WORLD_SIZE', '1'))
+
+    if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
+        # We are using (OpenMPI) mpirun for launching distributed data
+        # parallel processes.
+        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
+        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
+
+        # Possibly running with Slurm.
+        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
+        nodeid = int(os.getenv('SLURM_NODEID', '0'))
+
+        args.local_rank = local_rank
+        args.rank = nodeid * local_size + local_rank
+        args.world_size = num_nodes * local_size
+
+    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
+    if args.rank == 0:
+        print('using world size: {} and model-parallel size: {} '.format(
+            args.world_size, args.model_parallel_size))
+
+    args.dynamic_loss_scale = False
+    if args.loss_scale is None:
+        args.dynamic_loss_scale = True
+        if args.rank == 0:
+            print(' > using dynamic loss scaling')
+
+    # The fp32_* args are only meaningful when fp16 is set, so they should
+    # all default to False otherwise.
+    if not args.fp16:
+        args.fp32_embedding = False
+        args.fp32_tokentypes = False
+        args.fp32_layernorm = False
+
+    return args
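For context, `get_args()` reads `sys.argv` plus the `RANK`/`WORLD_SIZE` environment variables, so a quick way to inspect the resulting namespace is a sketch like the following (illustrative flag values only; assumes deepspeed is installed, since the module imports it):

    import sys
    from modelscope.models.nlp.plug.arguments import get_args

    sys.argv = ['train.py', '--num-layers', '24', '--hidden-size', '1024',
                '--num-attention-heads', '16', '--fp16']
    args = get_args()
    print(args.world_size)          # 1 unless WORLD_SIZE is set
    print(args.dynamic_loss_scale)  # True, since --loss-scale was not given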
diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration_plug.py
new file mode 100644
index 00000000..c05ff127
--- /dev/null
+++ b/modelscope/models/nlp/plug/configuration_plug.py
@@ -0,0 +1,368 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PLUG model configuration"""
+
+import copy
+import json
+from collections import OrderedDict
+from typing import Mapping
+
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class PlugNLUConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of the PLUG NLU
+    (encoder) model. It is used to instantiate the model according to the
+    specified arguments, defining the model architecture. The class follows the
+    design of the BERT configuration, see `bert-base-uncased
+    <https://huggingface.co/bert-base-uncased>`__.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can
+    be used to control the model outputs. Read the documentation from
+    :class:`~transformers.PretrainedConfig` for more information.
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 21504):
+            Vocabulary size of the model. Defines the number of different tokens that
+            can be represented by the :obj:`input_ids` passed to the model.
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer
+            encoder.
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in
+            the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and
+            pooler. If string, :obj:`"gelu"`, :obj:`"relu"` and :obj:`"swish"` are
+            supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings,
+            encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or
+            2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed to the model.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.00707):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layernorm_epsilon (:obj:`float`, `optional`, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+
+    Examples::
+
+        >>> # Initializing a PLUG NLU style configuration
+        >>> configuration = PlugNLUConfig()
+
+        >>> # Inspecting the configuration
+        >>> configuration.to_json_string()
+    """
+    model_type = 'plugNLU'
+
+    def __init__(self,
+                 vocab_size=21504,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.00707,
+                 deep_init=False,
+                 deepspeed=False,
+                 lr_decay_style='linear',
+                 weight_decay=1e-2,
+                 clip_grad=1.0,
+                 warmup=0.01,
+                 pre_ln=False,
+                 fp16=False,
+                 fp32_layernorm=False,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-12,
+                 dec_hidden_layers=6,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0,
+                 pruning_initial_threshold=1.0,
+                 pruning_final_threshold=0.01,
+                 pruning_initial_warmup=1,
+                 pruning_final_warmup=20,
+                 pruning_module='decoder',
+                 pruning_decay_step=50,
+                 pruning_decay_type='exp',
+                 ft_module=None,
+                 attn_separate=True,
+                 LR_weight_rank=8,
+                 LR_mask_rank=8,
+                 **kwargs):
+        super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.deep_init = deep_init
+        self.deepspeed = deepspeed
+        self.lr_decay_style = lr_decay_style
+        self.weight_decay = weight_decay
+        self.clip_grad = clip_grad
+        self.warmup = warmup
+        self.pre_ln = pre_ln
+        self.fp16 = fp16
+        self.fp32_layernorm = fp32_layernorm
+        self.fp32_embedding = fp32_embedding
+        self.layernorm_epsilon = layernorm_epsilon
+        self.fp32_tokentypes = fp32_tokentypes
+        self.dec_hidden_layers = dec_hidden_layers
+        self.pruning_method = pruning_method
+        self.pruning_mask_init = pruning_mask_init
+        self.pruning_mask_scale = pruning_mask_scale
+        self.pruning_module = pruning_module
+        self.pruning_initial_threshold = pruning_initial_threshold
+        self.pruning_final_threshold = pruning_final_threshold
+        self.pruning_initial_warmup = pruning_initial_warmup
+        self.pruning_final_warmup = pruning_final_warmup
+        self.pruning_decay_step = pruning_decay_step
+        self.pruning_decay_type = pruning_decay_type
+        self.ft_module = ft_module
+        self.attn_separate = attn_separate
+        self.LR_weight_rank = LR_weight_rank
+        self.LR_mask_rank = LR_mask_rank
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a config from a Python dictionary of parameters."""
+        config = cls()
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a config from a json file of parameters."""
+        with open(json_file, 'r', encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def merge_args(self, args):
+        """Merge attributes from an argparse namespace that are not already
+        set on this config."""
+        local_keys = self.__dict__.keys()
+        for key, value in args.__dict__.items():
+            if key in local_keys:
+                continue
+            self.__dict__[key] = value
+        return self
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'
+
+
+class PlugNLGConfig(PlugNLUConfig):
+    r"""
+    This is the configuration class to store the configuration of the PLUG NLG
+    (encoder-decoder) model. It accepts the same arguments as
+    :class:`PlugNLUConfig` (see above); the only difference is that
+    :obj:`attn_separate` defaults to :obj:`False`.
+    """
+    model_type = 'plugNLG'
+
+    def __init__(self,
+                 vocab_size=21504,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.00707,
+                 deep_init=False,
+                 deepspeed=False,
+                 lr_decay_style='linear',
+                 weight_decay=1e-2,
+                 clip_grad=1.0,
+                 warmup=0.01,
+                 pre_ln=False,
+                 fp16=False,
+                 fp32_layernorm=False,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-12,
+                 dec_hidden_layers=6,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0,
+                 pruning_initial_threshold=1.0,
+                 pruning_final_threshold=0.01,
+                 pruning_initial_warmup=1,
+                 pruning_final_warmup=20,
+                 pruning_module='decoder',
+                 pruning_decay_step=50,
+                 pruning_decay_type='exp',
+                 ft_module=None,
+                 attn_separate=False,
+                 LR_weight_rank=8,
+                 LR_mask_rank=8,
+                 **kwargs):
+        super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.deep_init = deep_init
+        self.deepspeed = deepspeed
+        self.lr_decay_style = lr_decay_style
+        self.weight_decay = weight_decay
+        self.clip_grad = clip_grad
+        self.warmup = warmup
+        self.pre_ln = pre_ln
+        self.fp16 = fp16
+        self.fp32_layernorm = fp32_layernorm
+        self.fp32_embedding = fp32_embedding
+        self.layernorm_epsilon = layernorm_epsilon
+        self.fp32_tokentypes = fp32_tokentypes
+        self.dec_hidden_layers = dec_hidden_layers
+        self.pruning_method = pruning_method
+        self.pruning_mask_init = pruning_mask_init
+        self.pruning_mask_scale = pruning_mask_scale
+        self.pruning_module = pruning_module
+        self.pruning_initial_threshold = pruning_initial_threshold
+        self.pruning_final_threshold = pruning_final_threshold
+        self.pruning_initial_warmup = pruning_initial_warmup
+        self.pruning_final_warmup = pruning_final_warmup
+        self.pruning_decay_step = pruning_decay_step
+        self.pruning_decay_type = pruning_decay_type
+        self.ft_module = ft_module
+        self.attn_separate = attn_separate
+        self.LR_weight_rank = LR_weight_rank
+        self.LR_mask_rank = LR_mask_rank
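A minimal sketch of round-tripping the config through `to_dict`/`from_dict` (the values are illustrative, not the released PLUG hyper-parameters):

    from modelscope.models.nlp.plug import PlugNLGConfig

    cfg = PlugNLGConfig(hidden_size=768, num_hidden_layers=12,
                        dec_hidden_layers=6, fp16=True)
    restored = PlugNLGConfig.from_dict(cfg.to_dict())
    assert restored.dec_hidden_layers == 6
    print(restored.to_json_string())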
diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py
new file mode 100644
index 00000000..ae6baf68
--- /dev/null
+++ b/modelscope/models/nlp/plug/distributed_plug.py
@@ -0,0 +1,198 @@
+import os
+import random
+from typing import Dict
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from modelscope.models.base import Tensor
+from modelscope.utils.nlp import mpu
+from modelscope.utils.nlp.distributed import DistributedDataParallel as DDP
+from modelscope.utils.nlp.fp16 import FP16_Module
+from modelscope.utils.nlp.utils import print_rank_0
+from modelscope.utils.torch_utils import init_dist
+
+from . import PlugModel
+
+
+def initialize_distributed(rank):
+    """Initialize torch.distributed and the model-parallel communicators."""
+    # Manually set the device ids.
+    # torch.multiprocessing.set_start_method("spawn")
+    device = rank % torch.cuda.device_count()
+    torch.cuda.set_device(device)
+    # Call the init process.
+    master_ip = os.getenv('MASTER_ADDR', '127.0.0.1')
+    master_port = os.getenv('MASTER_PORT', '12345')
+    init_method = 'tcp://' + master_ip + ':' + master_port
+    torch.distributed.init_process_group(
+        backend='nccl', world_size=8, rank=rank, init_method=init_method)
+    # Set the model-parallel communicators.
+    mpu.initialize_model_parallel(8)
+
+
+def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+    # This function has been mostly taken from the huggingface conversational
+    # ai code at
+    # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313
+
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of
+        # the top-k.
+        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        # Convert to 1D.
+        logits = logits.view(logits.size()[1]).contiguous()
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold.
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above
+        # the threshold.
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+        indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        logits[indices_to_remove] = filter_value
+        # Going back to 2D.
+        logits = logits.view(1, -1).contiguous()
+
+    return logits
+
+
+class DistributedPlug:
+    # NOTE: the helper methods below are invoked unbound, with `cls` passed
+    # explicitly from the `init` classmethod.
+
+    @classmethod
+    def init(cls, rank, model_dir, model_config, args):
+        cls.rank = rank
+        cls.args = args
+        cls.config = model_config
+        cls.model_dir = model_dir
+        initialize_distributed(rank)
+        cls.set_random_seed(cls, args.seed)
+        cls.setup_model(cls, path_load_tag='model')
+
+    def set_random_seed(cls, seed):
+        if seed is not None and seed > 0:
+            random.seed(seed)
+            np.random.seed(seed)
+            torch.manual_seed(seed)
+            mpu.model_parallel_cuda_manual_seed(seed)
+
+    def get_model(cls):
+        """Build the model."""
+        print_rank_0('Building Plug model. It will take a few minutes ...')
+        model = PlugModel(cls.config)
+
+        if mpu.get_data_parallel_rank() == 0:
+            print(' > number of parameters on model parallel rank {}: {}'.format(
+                mpu.get_model_parallel_rank(),
+                sum([p.nelement() for p in model.parameters()])), flush=True)
+
+        if cls.args.deepspeed and cls.args.fp16:
+            model.half()
+
+        # GPU allocation.
+        model.cuda(torch.cuda.current_device())
+
+        # Fp16 conversion.
+        if cls.args.fp16:
+            model = FP16_Module(model)
+            if cls.args.fp32_embedding:
+                model.module.model.bert.embeddings.word_embeddings.float()
+                model.module.model.bert.embeddings.position_embeddings.float()
+                model.module.model.bert.embeddings.token_type_embeddings.float()
+            if cls.args.fp32_tokentypes:
+                model.module.model.bert.embeddings.token_type_embeddings.float()
+            if cls.args.fp32_layernorm:
+                for name, _module in model.named_modules():
+                    if 'LayerNorm' in name:
+                        _module.float()
+
+        # model = DDP(model)
+        return model
+
+    def setup_model(cls, path_load_tag='model'):
+        dist_model = cls.get_model(cls)
+        if cls.model_dir is not None:
+            from modelscope.utils.nlp.load_checkpoint import pre_load
+            load_model = pre_load(mpu, cls.model_dir, tag=path_load_tag)
+            # With DDP enabled this would be dist_model.module.module.model.
+            model_dict = dist_model.module.model.state_dict()
+            for key in load_model:
+                if key not in model_dict.keys():
+                    print_rank_0('Skip key: ' + key)
+                else:
+                    print_rank_0('Loading key: ' + key)
+            dist_model.module.model.load_state_dict(load_model, strict=False)
+        cls.args.iteration = 0
+        cls.dist_model = dist_model
+
+    @classmethod
+    def forward(cls, input: Dict[str, Tensor]):
+        device = torch.cuda.current_device()
+        tokens = input['input_ids'].to(device)
+        dec_input_ids = input['dec_input_ids'].to(device)
+        attention_mask = input['attention_mask'].to(device)
+        cls.dist_model.eval()
+        seq_length = 128
+        with torch.no_grad():
+            all_generate_tokens = []
+            generate_tokens = []
+            counter = 0
+            sequence_output = None
+            vocab_size = 21128
+            while counter < seq_length:
+                position_ids = torch.full([cls.args.batch_size, 1],
+                                          len(generate_tokens),
+                                          dtype=torch.long,
+                                          device=device)
+                _, logits, sequence_output = cls.dist_model(
+                    tokens, None, attention_mask, dec_input_ids,
+                    attention_mask, position_ids, is_infer=True,
+                    sequence_output=sequence_output, parallel_output=False)
+
+                logits = logits[:, -1, :]
+                logits = logits / cls.args.temperature
+                logits = top_k_logits(logits, top_k=cls.args.top_k,
+                                      top_p=cls.args.top_p)
+                log_probs = F.softmax(logits, dim=-1)
+                prev = torch.multinomial(log_probs, num_samples=1)
+                prev_token = prev[0].item()
+                # Map out-of-vocabulary samples to [UNK] (id 100).
+                if prev_token >= vocab_size:
+                    prev_token = 100
+                    prev[0] = 100
+                # Stop at [SEP] (id 102).
+                if prev_token == 102:
+                    break
+                dec_input_ids = torch.cat([dec_input_ids, prev], dim=1)
+                generate_tokens.append(prev_token)
+                all_generate_tokens.append(prev_token)
+                counter += 1
+
+        # Collapse runs of consecutive [UNK] tokens into one.
+        generate_context = []
+        for token in all_generate_tokens:
+            if generate_context and generate_context[-1] == 100 and token == 100:
+                continue
+            generate_context.append(token)
+        return {'generate_context': generate_context}
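To make the filtering concrete, here is a small sketch of `top_k_logits` in isolation (note that it modifies `logits` in place, and the top-p branch assumes a `[1, vocab]` shaped tensor):

    import torch
    from modelscope.models.nlp.plug.distributed_plug import top_k_logits

    logits = torch.tensor([[1.0, 3.0, 2.0, 0.5]])
    filtered = top_k_logits(logits.clone(), top_k=2)
    print(filtered)  # tensor([[-inf, 3., 2., -inf]])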
""" + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + print("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m"] for n in name): + print("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + +class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = mpu.VocabParallelEmbedding( + config.vocab_size, config.hidden_size, + init_method=normal_init_method(mean=0.0, + std=config.initializer_range)) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.fp32_layernorm = config.fp32_layernorm + self.fp32_embedding = config.fp32_embedding + self.fp32_tokentypes = config.fp32_tokentypes + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if not self.fp32_tokentypes: + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + if self.fp32_embedding and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_embedding: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + else: + embeddings = words_embeddings.float() + position_embeddings.float() + token_type_embeddings.float() + if self.fp32_tokentypes and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_tokentypes: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method(mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method(mean=0.0, + std=config.initializer_range) + self.dense = mpu.RowParallelLinear( + input_size=config.hidden_size, + output_size=config.hidden_size, + bias=True, + input_is_parallel=True, + stride=1, + init_method=init_method, + pruning_method=config.pruning_method if config.pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfvo', 'encoder_selfo'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.fp32_layernorm = config.fp32_layernorm + if not config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, 
input_tensor, pruning_threshold=None,): + hidden_states = self.dense(hidden_states, pruning_threshold=pruning_threshold,) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + if self.LayerNorm is not None: + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + else: + hidden_states = ln_input + return hidden_states + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.self = mpu.BertParallelSelfAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + dropout_prob=config.attention_probs_dropout_prob, + output_parallel=True, + init_method=normal_init_method(mean=0.0, + std=config.initializer_range), + separate=config.attn_separate, + pruning_method=config.pruning_method, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + pruning_module=config.pruning_module, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask, pruning_threshold=None,): + if self.LayerNorm is not None: + ln_input = input_tensor + previous_type = input_tensor.type() + if self.fp32_layernorm: + ln_input = input_tensor.float() + ln_output = self.LayerNorm(ln_input) + if self.fp32_layernorm: + ln_output = ln_output.type(previous_type) + self_output = self.self(ln_output, attention_mask, pruning_threshold=pruning_threshold,) + else: + self_output = self.self(input_tensor, attention_mask, pruning_threshold=pruning_threshold,) + # output_pruning_threshold = 1 - (1 - pruning_threshold)/0.99*0.95 + output_pruning_threshold = pruning_threshold + + attention_output = self.output(self_output, input_tensor, pruning_threshold=output_pruning_threshold,) + return attention_output + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = mpu.ColumnParallelLinear( + input_size=config.hidden_size, + output_size=config.intermediate_size, + bias=True, + gather_output=False, + stride=1, + init_method=normal_init_method(mean=0.0, std=config.initializer_range), + pruning_method=config.pruning_method if config.pruning_module in ['all', 'encoder', 'encoder_ffn'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states, pruning_threshold=None,): + hidden_states = self.dense(hidden_states, pruning_threshold=pruning_threshold,) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + if hasattr(config, 'deep_init') and config.deep_init: + init_method = scaled_init_method(mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + else: + init_method = normal_init_method(mean=0.0, + std=config.initializer_range) + self.dense = 
mpu.RowParallelLinear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=True, + input_is_parallel=True, + stride=1, + init_method=init_method, + pruning_method=config.pruning_method if config.pruning_module in ['all', 'encoder', 'encoder_ffn'] else None, + pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank) + self.fp32_layernorm = config.fp32_layernorm + if not config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor, pruning_threshold=None,): + hidden_states = self.dense(hidden_states, pruning_threshold=pruning_threshold,) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + if self.LayerNorm is not None: + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + else: + hidden_states = ln_input + return hidden_states + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + + def forward(self, hidden_states, attention_mask, pruning_threshold=None,): + attention_output = self.attention(hidden_states, attention_mask, pruning_threshold=pruning_threshold) + if self.LayerNorm is not None: + ln_input = attention_output + previous_type = attention_output.type() + if self.fp32_layernorm: + ln_input = attention_output.float() + ln_output = self.LayerNorm(ln_input) + if self.fp32_layernorm: + ln_output = ln_output.type(previous_type) + intermediate_output = self.intermediate(ln_output, pruning_threshold=pruning_threshold) + else: + intermediate_output = self.intermediate(attention_output, pruning_threshold=pruning_threshold) + layer_output = self.output(intermediate_output, attention_output, pruning_threshold=pruning_threshold) + return layer_output + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.fp32_layernorm = config.fp32_layernorm + if config.pre_ln: + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.LayerNorm = None + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, pruning_threshold=None,): + all_encoder_layers = [] + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1], pruning_threshold=pruning_threshold) + return x_ + return custom_forward + + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = 1 #math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = mpu.checkpoint(custom(l, l+chunk_length), hidden_states, attention_mask*1) + if detach_index == l: + hidden_states.detach_() + l += chunk_length + # decoder layers + else: 
+ for i, layer_module in enumerate(self.layer):
+ hidden_states = layer_module(hidden_states, attention_mask, pruning_threshold=pruning_threshold)
+ if detach_index == i:
+ hidden_states.detach_()
+ if i == len(self.layer) - 1 and self.LayerNorm is not None:
+ previous_type = hidden_states.type()
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.float()
+ hidden_states = self.LayerNorm(hidden_states)
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.type(previous_type)
+ if output_all_encoded_layers:
+ all_encoder_layers.append(hidden_states)
+
+ if not output_all_encoded_layers or checkpoint_activations:
+ if self.LayerNorm is not None:
+ previous_type = hidden_states.type()
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.float()
+ hidden_states = self.LayerNorm(hidden_states)
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.type(previous_type)
+ all_encoder_layers.append(hidden_states)
+ return all_encoder_layers
+
+class BertPooler(nn.Module):
+ def __init__(self, config):
+ super(BertPooler, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+class BertPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super(BertPredictionHeadTransform, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.transform_act_fn = ACT2FN[config.hidden_act] \
+ if isinstance(config.hidden_act, str) else config.hidden_act
+ self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon)
+ self.fp32_layernorm = config.fp32_layernorm
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ previous_type = hidden_states.type()
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.float()
+ hidden_states = self.LayerNorm(hidden_states)
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.type(previous_type)
+ return hidden_states
+
+class BertLMPredictionHead(nn.Module):
+ def __init__(self, config, bert_model_embedding_weights):
+ super(BertLMPredictionHead, self).__init__()
+ self.transform = BertPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
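+ # Weight tying: decoder_weight below aliases the input embedding matrix,
+ # so the per-token bias is the only new parameter here; it is flagged
+ # model_parallel since the vocabulary dimension is partitioned across ranks.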
+ #self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
+ # bert_model_embedding_weights.size(0),
+ # bias=False)
+ self.decoder_weight = bert_model_embedding_weights
+ self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
+ self.bias.model_parallel = True
+ self.fp32_embedding = config.fp32_embedding
+ self.fp32_layernorm = config.fp32_layernorm
+ def convert_to_type(tensor):
+ if self.fp32_embedding:
+ return tensor.half()
+ else:
+ return tensor
+ self.type_converter = convert_to_type
+ self.converted = False
+ self.timers = SynchronizedWallClockTimer()
+
+ def forward(self, hidden_states):
+ if not self.converted:
+ self.converted = True
+ if self.fp32_embedding:
+ self.transform.half()
+ if self.fp32_layernorm:
+ self.transform.LayerNorm.float()
+ hidden_states = self.transform(self.type_converter(hidden_states))
+ # hidden_states = self.decoder(hidden_states) + self.bias
+ self.timers('final linear gather').start()
+ hidden_states = mpu.copy_to_model_parallel_region(hidden_states)
+ self.timers('final linear gather').stop()
+ hidden_states = F.linear(self.type_converter(hidden_states),
+ self.type_converter(self.decoder_weight),
+ self.type_converter(self.bias))
+ #self.timers.log(names=['final linear gather'])
+ return hidden_states
+
+
+class BertPreTrainingHeads(nn.Module):
+ def __init__(self, config, bert_model_embedding_weights):
+ super(BertPreTrainingHeads, self).__init__()
+ self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+ self.seq_relationship = nn.Linear(config.hidden_size, 3)
+
+ def forward(self, sequence_output, pooled_output):
+ prediction_scores = self.predictions(sequence_output)
+ for p in self.seq_relationship.parameters():
+ if p is None:
+ continue
+ pooled_output = pooled_output.type_as(p)
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return prediction_scores, seq_relationship_score
+
+class PreTrainedBertModel(nn.Module):
+ """ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+ def __init__(self, config, *inputs, **kwargs):
+ super(PreTrainedBertModel, self).__init__()
+ if not isinstance(config, PlugNLUConfig) and not isinstance(config, PlugNLGConfig):
+ raise ValueError(
+ "Parameter config in `{}(config)` should be an instance of class "
+ "`PlugNLUConfig` or `PlugNLGConfig`. To create a model from a "
+ "pretrained model file use "
+ "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+ self.__class__.__name__, self.__class__.__name__
+ ))
+ self.config = config
+
+ def init_bert_weights(self, module):
+ """ Initialize the weights.
+ """
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, BertLayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ #@classmethod
+ #def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
+ # fp32_layernorm=False, fp32_embedding=False, layernorm_epsilon=1e-12,
+ # fp32_tokentypes=False, *inputs, **kwargs):
+ # """
+ # Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict.
+ # Download and cache the pre-trained model file if needed.
+
+ # Params:
+ # pretrained_model_name: either:
+ # - a str with the name of a pre-trained model to load selected in the list of:
+ # . `bert-base-uncased`
+ # . `bert-large-uncased`
+ # . `bert-base-cased`
+ # . `bert-large-cased`
+ # . `bert-base-multilingual-uncased`
+ # . `bert-base-multilingual-cased`
+ # . `bert-base-chinese`
+ # - a path or url to a pretrained model archive containing:
+ # . `bert_config.json` a configuration file for the model
+ # . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
+ # cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+ # state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
+ # *inputs, **kwargs: additional input for the specific Bert class
+ # (ex: num_labels for BertForSequenceClassification)
+ # """
+ # if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
+ # archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+ # else:
+ # archive_file = pretrained_model_name
+ # # redirect to the cache, if necessary
+ # try:
+ # resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+ # except FileNotFoundError:
+ # logger.error(
+ # "Model name '{}' was not found in model name list ({}). "
+ # "We assumed '{}' was a path or url but couldn't find any file "
+ # "associated to this path or url.".format(
+ # pretrained_model_name,
+ # ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+ # archive_file))
+ # return None
+ # if resolved_archive_file == archive_file:
+ # logger.info("loading archive file {}".format(archive_file))
+ # else:
+ # logger.info("loading archive file {} from cache at {}".format(
+ # archive_file, resolved_archive_file))
+ # tempdir = None
+ # if os.path.isdir(resolved_archive_file):
+ # serialization_dir = resolved_archive_file
+ # else:
+ # # Extract archive to temp dir
+ # tempdir = tempfile.mkdtemp()
+ # logger.info("extracting archive file {} to temp dir {}".format(
+ # resolved_archive_file, tempdir))
+ # with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+ # archive.extractall(tempdir)
+ # serialization_dir = tempdir
+ # # Load config
+ # config_file = os.path.join(serialization_dir, CONFIG_NAME)
+ # config = PlugNLUConfig.from_json_file(config_file)
+ # config.fp32_layernorm = fp32_layernorm
+ # config.fp32_embedding = fp32_embedding
+ # config.layernorm_epsilon = layernorm_epsilon
+ # config.fp32_tokentypes = fp32_tokentypes
+ # logger.info("Model config {}".format(config))
+ # # Instantiate model.
+ # model = cls(config, *inputs, **kwargs)
+ # if state_dict is None:
+ # weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+ # state_dict = torch.load(weights_path)
+
+ # old_keys = []
+ # new_keys = []
+ # for key in state_dict.keys():
+ # new_key = None
+ # if 'gamma' in key:
+ # new_key = key.replace('gamma', 'weight')
+ # if 'beta' in key:
+ # new_key = key.replace('beta', 'bias')
+ # if new_key:
+ # old_keys.append(key)
+ # new_keys.append(new_key)
+ # for old_key, new_key in zip(old_keys, new_keys):
+ # state_dict[new_key] = state_dict.pop(old_key)
+
+ # missing_keys = []
+ # unexpected_keys = []
+ # error_msgs = []
+ # # copy state_dict so _load_from_state_dict can modify it
+ # metadata = getattr(state_dict, '_metadata', None)
+ # state_dict = state_dict.copy()
+ # if metadata is not None:
+ # state_dict._metadata = metadata
+
+ # def load(module, prefix=''):
+ # local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+ # module._load_from_state_dict(
+ # state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+ # for name, child in module._modules.items():
+ # if child is not None:
+ # load(child, prefix + name + '.')
+ # load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
+ # if len(missing_keys) > 0:
+ # logger.info("Weights of {} not initialized from pretrained model: {}".format(
+ # model.__class__.__name__, missing_keys))
+ # if len(unexpected_keys) > 0:
+ # logger.info("Weights from pretrained model not used in {}: {}".format(
+ # model.__class__.__name__, unexpected_keys))
+ # if tempdir:
+ # # Clean up temp dir
+ # shutil.rmtree(tempdir)
+ # return model
+
+class BertModel(PreTrainedBertModel):
+ """BERT model ("Bidirectional Encoder Representations from Transformers").
+
+ Params:
+ config: a BertConfig class instance with the configuration to build a new model
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+ with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+ `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
+
+ Outputs: Tuple of (encoded_layers, pooled_output)
+ `encoded_layers`: controlled by the `output_all_encoded_layers` argument:
+ - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+ of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+ encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+ - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+ to the last attention block of shape [batch_size, sequence_length, hidden_size],
+ `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+ classifier pretrained on top of the hidden state associated to the first token of the
+ input (`[CLS]`) to train on the Next-Sentence task (see BERT's paper).
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ model = modeling.BertModel(config=config)
+ all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+ def __init__(self, config):
+ super(BertModel, self).__init__(config)
+ self.embeddings = BertEmbeddings(config)
+ self.encoder = BertEncoder(config)
+ self.pooler = BertPooler(config)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, pruning_threshold=None,):
+ if attention_mask is None:
+ attention_mask = torch.ones_like(input_ids)
+ if token_type_ids is None:
+ token_type_ids = torch.zeros_like(input_ids)
+
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is simpler than the triangular masking of causal attention
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
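+ # For example, a padding mask [1, 1, 0] becomes the additive bias
+ # [0.0, 0.0, -10000.0] after the two lines below.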
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.encoder.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations, + detach_index=detach_index, + pruning_threshold=pruning_threshold) + sequence_output = encoded_layers[-1] + for p in self.pooler.parameters(): + if p is None: + continue + sequence_output = sequence_output.type_as(p) + break + #pooled_output = self.pooler(sequence_output) + pooled_output = sequence_output[:, 0] + if not output_all_encoded_layers or checkpoint_activations: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class DecodeLayer(nn.Module): + def __init__(self, config): + super(DecodeLayer, self).__init__() + init_method = normal_init_method(mean=0.0,std=config.initializer_range) + output_layer_init_method = scaled_init_method(mean=0.0, + std=config.initializer_range, + num_layers=config.num_hidden_layers) + + self_pruning_method = config.pruning_method + cross_pruning_method = config.pruning_method + ffn_pruning_method = config.pruning_method + + if config.ft_module is not None: + if 'decoder_self' in config.ft_module: + self_pruning_method = 'finetune' + if 'decoder_cross' in config.ft_module: + cross_pruning_method = 'finetune' + if 'decoder_ffn' in config.ft_module: + ffn_pruning_method = 'finetune' + + self.attention = mpu.GPT2ParallelSelfAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + attention_dropout_prob=config.attention_probs_dropout_prob, + output_dropout_prob=config.hidden_dropout_prob, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + pruning_method=self_pruning_method if config.pruning_module in ['all', 'decoder', 'decoder_self', 'decoder_self+ffn'] else None, + pruning_mask_init=config.pruning_mask_init, pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank, + ) + + self.cross_attention = mpu.PalmParallelCrossAttention( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + attention_dropout_prob=config.attention_probs_dropout_prob, + output_dropout_prob=config.hidden_dropout_prob, + init_method=init_method, attn_separate=False, + output_layer_init_method=output_layer_init_method, + pruning_method=cross_pruning_method, pruning_mask_init=config.pruning_mask_init, + pruning_mask_scale=config.pruning_mask_scale, pruning_module=config.pruning_module, + LR_weight_rank=config.LR_weight_rank, + LR_mask_rank=config.LR_mask_rank,) + + self.input_layernorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.post_attention_layernorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.post_cross_attention_layernorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + + self.intermediate = mpu.ColumnParallelLinear(config.hidden_size, config.intermediate_size, gather_output=False, init_method=init_method, + pruning_method=ffn_pruning_method if config.pruning_module in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, + pruning_mask_init=config.pruning_mask_init, pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, 
LR_mask_rank=config.LR_mask_rank,) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.output = mpu.RowParallelLinear(config.intermediate_size, config.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, + pruning_method=ffn_pruning_method if config.pruning_module in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, + pruning_mask_init=config.pruning_mask_init, pruning_mask_scale=config.pruning_mask_scale, + LR_weight_rank=config.LR_weight_rank, LR_mask_rank=config.LR_mask_rank,) + + self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) + self.fp32_layernorm = config.fp32_layernorm + def convert_to_type(tensor): + if self.fp32_layernorm: + return tensor.float() + else: + return tensor + self.type_converter = convert_to_type + + + #def forward(self, hidden_states, enc_attn_mask, dec_attn_mask): + def forward(self, hidden_states, enc_hidden_states, enc_attn_mask, dec_attn_mask, is_infer=False, pruning_threshold=None): + residual = hidden_states + previous_type = hidden_states.type() + hidden_states = self.input_layernorm(self.type_converter(hidden_states)) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + hidden_states = self.attention(hidden_states, dec_attn_mask, is_infer=is_infer, pruning_threshold=pruning_threshold) + # add dropout? + # hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(self.type_converter(hidden_states)) + if self.fp32_layernorm: + # same to the output of BertAttention + hidden_states = hidden_states.type(previous_type) + hidden_states = self.cross_attention(hidden_states, enc_hidden_states, enc_attn_mask, pruning_threshold=pruning_threshold) + # hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.post_cross_attention_layernorm(self.type_converter(hidden_states)) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + hidden_states = self.intermediate(hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.intermediate_act_fn(hidden_states) + # hidden_states = self.dropout(hidden_states) + + hidden_states = self.output(hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + +class BertDecoder(nn.Module): + def __init__(self, config): + super(BertDecoder, self).__init__() + self.layer = nn.ModuleList([DecodeLayer(config) for _ in range(config.dec_hidden_layers)]) + + self.final_layernorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.fp32_layernorm = config.fp32_layernorm + + def forward(self, hidden_states, enc_hidden_states, enc_attn_mask, dec_attn_mask, checkpoint_activations=False, output_all_encoded_layers=False, is_infer=False, pruning_threshold=None): + all_encoder_layers = [] + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1], inputs[2], dec_attn_mask*1, is_infer=is_infer, pruning_threshold=pruning_threshold) + return x_ + return custom_forward + + pre_enc_hidden= enc_hidden_states.data + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = 1 #math.ceil(math.sqrt(num_layers)) + while l < num_layers: + 
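# mpu.checkpoint recomputes this chunk of layers during backward; the
+ # encoder states' .data is restored right after each call (pre_enc_hidden
+ # saved above), apparently because the checkpoint call may overwrite it.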
+ hidden_states = mpu.checkpoint(custom(l, l+chunk_length), hidden_states, enc_hidden_states, enc_attn_mask*1)
+ enc_hidden_states.data = pre_enc_hidden
+ l += chunk_length
+ else:
+ for i, layer_module in enumerate(self.layer):
+ hidden_states = layer_module(hidden_states, enc_hidden_states, enc_attn_mask, dec_attn_mask, is_infer=is_infer, pruning_threshold=pruning_threshold)
+
+ previous_type = hidden_states.type()
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.float()
+ hidden_states = self.final_layernorm(hidden_states)
+ if self.fp32_layernorm:
+ hidden_states = hidden_states.type(previous_type)
+
+ return [hidden_states]
+
+class DecodeModel(PreTrainedBertModel):
+
+ def __init__(self, config):
+ super(DecodeModel, self).__init__(config)
+ self.decoder = BertDecoder(config)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, embeddings, sequence_output, decode_input_ids, position_ids=None, enc_attn_mask=None, dec_attn_mask=None, checkpoint_activations=False, is_infer=False, pruning_threshold=None):
+
+ extended_attention_mask = enc_attn_mask.unsqueeze(1).unsqueeze(2)
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.decoder.parameters()).dtype) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+ embedding_output = embeddings(decode_input_ids)
+ sequence_output = self.decoder(embedding_output,
+ sequence_output,
+ extended_attention_mask,
+ dec_attn_mask,
+ checkpoint_activations=checkpoint_activations,
+ is_infer=is_infer,
+ pruning_threshold=pruning_threshold)
+ return sequence_output[-1]
+
+class PalmForPreTraining(PreTrainedBertModel):
+ def __init__(self, config):
+ super(PalmForPreTraining, self).__init__(config)
+ self.bert = BertModel(config)
+ self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+ self.decoder = DecodeModel(config)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, decode_input_ids=None, position_ids=None, decode_attention_mask=None, lm_labels=None, checkpoint_activations=False, is_infer=False, sequence_output=None, parallel_output=True, pruning_threshold=None):
+ if sequence_output is None:
+ sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
+ output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations, pruning_threshold=pruning_threshold)
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+ else:
+ prediction_scores = None
+ seq_relationship_score = None
+ sequence_output = sequence_output.to(dtype=next(self.decoder.parameters()).dtype)
+ if attention_mask is None:
+ attention_mask = torch.ones_like(input_ids)
+ decode_output = self.decoder(self.bert.embeddings, sequence_output, decode_input_ids, position_ids, attention_mask, decode_attention_mask, checkpoint_activations=checkpoint_activations, is_infer=is_infer, pruning_threshold=pruning_threshold)
+
+ #prediction_scores = self.cls(decode_output)
+
+ transformer_output_parallel = mpu.copy_to_model_parallel_region(
+ decode_output)
+
+ logits_parallel = F.linear(transformer_output_parallel,
+ self.bert.embeddings.word_embeddings.weight)
+
+ if parallel_output:
+ return prediction_scores, logits_parallel
+ if is_infer:
+ return prediction_scores, mpu.gather_from_model_parallel_region(logits_parallel), sequence_output
+ return prediction_scores, mpu.gather_from_model_parallel_region(logits_parallel)
+
+class PlugModel(torch.nn.Module):
+
+ def __init__(self, config):
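+ # Builds the PALM encoder-decoder (PalmForPreTraining) defined above;
+ # state_dict()/load_state_dict() below delegate to it so checkpoints round-trip.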
+ super(PlugModel, self).__init__()
+ if config.intermediate_size is None:
+ intermediate_size = 4 * config.hidden_size
+ else:
+ intermediate_size = config.intermediate_size
+ self.config = config
+ # self.config = BertConfig(
+ # args.tokenizer_num_tokens,
+ # hidden_size=args.hidden_size,
+ # num_hidden_layers=args.num_layers,
+ # num_attention_heads=args.num_attention_heads,
+ # intermediate_size=intermediate_size,
+ # hidden_dropout_prob=args.hidden_dropout,
+ # attention_probs_dropout_prob=args.attention_dropout,
+ # max_position_embeddings=args.max_position_embeddings,
+ # type_vocab_size=args.tokenizer_num_type_tokens,
+ # fp32_layernorm=args.fp32_layernorm,
+ # fp32_embedding=args.fp32_embedding,
+ # fp32_tokentypes=args.fp32_tokentypes,
+ # layernorm_epsilon=args.layernorm_epsilon,
+ # deep_init=args.deep_init,
+ # dec_hidden_layers=args.dec_layers)
+ self.model = PalmForPreTraining(self.config)
+
+ def forward(self, input_tokens, token_type_ids=None,
+ attention_mask=None, target_tokens=None, position_ids=None, decode_attention_mask=None, checkpoint_activations=False, is_infer=False, sequence_output=None, parallel_output=True):
+ return self.model(
+ input_tokens, token_type_ids, attention_mask, target_tokens, position_ids,
+ decode_attention_mask, checkpoint_activations=checkpoint_activations, is_infer=is_infer, sequence_output=sequence_output, parallel_output=parallel_output)
+
+ def state_dict(self, destination=None, prefix='', keep_vars=False):
+ return self.model.state_dict(destination=destination, prefix=prefix,
+ keep_vars=keep_vars)
+
+ def load_state_dict(self, state_dict, strict=True):
+ return self.model.load_state_dict(state_dict, strict=strict)
+
+
diff --git a/modelscope/models/nlp/plug/plug_for_text_generation.py b/modelscope/models/nlp/plug/plug_for_text_generation.py
new file mode 100644
index 00000000..2875ebeb
--- /dev/null
+++ b/modelscope/models/nlp/plug/plug_for_text_generation.py
@@ -0,0 +1,57 @@
+import torch
+from typing import Dict
+from functools import partial
+
+from . import DistributedPlug
+from ...base import Tensor, TorchModel
+from ...builder import MODELS
+from ....metainfo import Models
+from ....outputs import OutputKeys
+from ....utils.constant import Tasks
+
+__all__ = ['PlugForTextGeneration']
+
+@MODELS.register_module(Tasks.text_generation, module_name=Models.plug)
+class PlugForTextGeneration(TorchModel):
+ def __init__(self, model_dir: str, *args, **kwargs):
+ super().__init__(model_dir, *args, **kwargs)
+
+ from transformers import BertTokenizer
+ from multiprocessing import Pool
+ from .arguments import get_args
+ from . import PlugNLGConfig
+
+ self.tokenizer = BertTokenizer.from_pretrained(model_dir)
+ model_config = PlugNLGConfig.from_pretrained(model_dir)
+
+ # TODO(suluyan): Arguments
+ args = get_args()
+ args.world_size = 8
+ args.model_parallel_size = 8
+ args.pre_load = True
+ args.distributed_backend = 'nccl'
+ args.fp16 = True
+ args.fp32_layernorm = True
+ args.checkpoint_activations = True
+ args.batch_size = 1
+ args.top_k = 20
+ args.top_p = 0.0
+ args.temperature = 0.9
+ self.args = args
+
+ self.world_size = args.world_size
+ ranks = list(range(self.world_size))
+ self.model_pool = Pool(self.world_size)
+ self.model_pool.map(partial(DistributedPlug.init, model_dir=model_dir, model_config=model_config, args=args), ranks)
+
+ def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+ # The model weights live in the worker processes rather than in this
+ # wrapper, so dispatch through the process pool as generate() does.
+ res = self.model_pool.map(DistributedPlug.forward, [input] * self.world_size)
+ return res[0]
+
+ def generate(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+ dec_input_ids = torch.full([self.args.batch_size, 1], self.tokenizer.cls_token_id, dtype=torch.long)
+ input['dec_input_ids'] = dec_input_ids
+ res = self.model_pool.map(DistributedPlug.forward, [input] * self.world_size)
+ return res[0]
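+
+# A minimal usage sketch (the model id below is a placeholder, and PLUG runs
+# 8-way model parallel, so this assumes a host with 8 GPUs):
+#
+#   from modelscope.pipelines import pipeline
+#   from modelscope.utils.constant import Tasks
+#
+#   pipe = pipeline(Tasks.text_generation, model='<plug-model-id>')
+#   print(pipe('今天天气很好，'))  # -> {'text': '...'}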
+
+
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index b1d82557..58f2ba1c 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -153,7 +153,7 @@ class Pipeline(ABC):
 if self.device_name == 'gpu':
 device = create_device()
 if device.type == 'gpu':
- torch.cuda.set_device(device)
+ pass # torch.cuda.set_device(device)
 yield
 else:
 yield
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 3d27ffa9..58f07d4c 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -8,6 +8,7 @@ from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.outputs import OutputKeys

 __all__ = ['TextGenerationPipeline']

@@ -56,6 +57,7 @@ class TextGenerationPipeline(Pipeline):
 sequence_length=kwargs.pop('sequence_length', 128))
 model.eval()
 super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+ self.tokenizer = preprocessor.tokenizer

 def forward(self, inputs: Dict[str, Any],
 **forward_params) -> Dict[str, Any]:
@@ -72,4 +74,6 @@ class TextGenerationPipeline(Pipeline):
 Returns:
 Dict[str, str]: the prediction results
 """
- return inputs
+ generate_context = inputs['generate_context']
+ generate_context = ''.join(self.tokenizer.convert_ids_to_tokens(generate_context)).replace('[UNK]', '“').replace('##', '')
+ return {OutputKeys.TEXT: generate_context}
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index f231df9a..017bcef8 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -161,7 +161,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
 """
 model_type = get_model_type(model_dir)

- if model_type in (Models.structbert, Models.gpt3, Models.palm):
+ if model_type in (Models.structbert, Models.gpt3, Models.palm, Models.plug):
 from modelscope.models.nlp.structbert import SbertTokenizer
 return SbertTokenizer.from_pretrained(model_dir, use_fast=False)
 elif model_type == Models.veco:
diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py
new file mode 100755
index 00000000..8403544d
--- /dev/null
+++ b/modelscope/utils/nlp/distributed.py
@@
-0,0 +1,109 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +import torch.distributed as dist +from torch.nn.modules import Module +from torch.autograd import Variable +from sofa.utils import mpu + +class DistributedDataParallel(Module): + + def __init__(self, module): + super(DistributedDataParallel, self).__init__() + self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False + + self.module = module + self.data_parallel_group = mpu.get_data_parallel_group() + src_rank = mpu.get_model_parallel_rank() + for p in self.module.parameters(): + if torch.is_tensor(p): + dist.broadcast(p, src_rank, group=self.data_parallel_group) + + def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False): + if(self.needs_reduction): + self.needs_reduction = False + buckets = {} + for name, param in self.module.named_parameters(): + if param.requires_grad and param.grad is not None: + tp = (param.data.type()) + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + if self.warn_on_half: + if torch.cuda.HalfTensor in buckets: + print("WARNING: gloo dist backend for half parameters may be extremely slow." 
+ + " It is recommended to use the NCCL backend in this case.") + self.warn_on_half = False + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + if fp32_allreduce: + coalesced = coalesced.float() + if not no_scale and not reduce_after: + coalesced /= dist.get_world_size(group=self.data_parallel_group) + dist.all_reduce(coalesced, group=self.data_parallel_group) + torch.cuda.synchronize() + if not no_scale and reduce_after: + coalesced /= dist.get_world_size(group=self.data_parallel_group) + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + self.hook_handles = [] + self.hooks = [] + for param in list(self.module.parameters()): + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(allreduce_params) + # handle = param.register_hook(allreduce_hook) + #self.hooks.append(allreduce_hook) + #self.hook_handles.append(handle) + self.allreduce_params = allreduce_params + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + #[h.remove() for h in self.hook_handles] + sd = self.module.state_dict(destination, prefix, keep_vars) + # for handle, hook in zip(self.hook_handles, self.hooks): + # d = handle.hooks_dict_ref() + # d[handle.id] = hook + + return sd + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + + ''' + def _sync_buffers(self): + buffers = list(self.module._all_buffers()) + if len(buffers) > 0: + # cross-node buffer sync + flat_buffers = _flatten_dense_tensors(buffers) + dist.broadcast(flat_buffers, 0) + for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): + buf.copy_(synced) + def train(self, mode=True): + # Clear NCCL communicator and CUDA event cache of the default group ID, + # These cache will be recreated at the later call. This is currently a + # work-around for a potential NCCL deadlock. + if dist._backend == dist.dist_backend.NCCL: + dist._clear_group_cache() + super(DistributedDataParallel, self).train(mode) + self.module.train(mode) + ''' + diff --git a/modelscope/utils/nlp/fp16/__init__.py b/modelscope/utils/nlp/fp16/__init__.py new file mode 100755 index 00000000..a2c68a1f --- /dev/null +++ b/modelscope/utils/nlp/fp16/__init__.py @@ -0,0 +1,30 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
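+
+# Half-precision training utilities (module/optimizer wrappers and loss
+# scalers), adapted from NVIDIA's apex/Megatron-LM implementations (see the
+# copyright header above).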
+from .fp16util import ( + BN_convert_float, + network_to_half, + prep_param_lists, + model_grads_to_master_grads, + master_params_to_model_params, + tofp16, + to_python_float, + clip_grad_norm, + convert_module, + convert_network, + FP16Model, +) + +from .fp16 import * +from .loss_scaler import * diff --git a/modelscope/utils/nlp/fp16/fp16.py b/modelscope/utils/nlp/fp16/fp16.py new file mode 100755 index 00000000..c1c6af57 --- /dev/null +++ b/modelscope/utils/nlp/fp16/fp16.py @@ -0,0 +1,629 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Stable version of apex FP16 Optimizer""" +import torch +from torch import nn +from torch.autograd import Variable +from torch.nn.parameter import Parameter +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from .loss_scaler import DynamicLossScaler, LossScaler +from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm + +FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) + +def conversion_helper(val, conversion): + """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + +def fp32_to_fp16(val): + """Convert fp32 `val` to fp16""" + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, FLOAT_TYPES): + val = val.half() + return val + return conversion_helper(val, half_conversion) + +def fp16_to_fp32(val): + """Convert fp16 `val` to fp32""" + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, HALF_TYPES): + val = val.float() + return val + return conversion_helper(val, float_conversion) + +class FP16_Module(nn.Module): + def __init__(self, module): + super(FP16_Module, self).__init__() + self.add_module('module', module.half()) + + def forward(self, *inputs, **kwargs): + return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination, prefix, keep_vars) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + +# TODO: Update overflow check + downscale to use Carl's fused kernel. +class FP16_Optimizer(object): + """ + :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, + and manage static or dynamic loss scaling and master weights in a manner transparent to the user. + For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, + and changing the call to ``backward``. 
+ + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + # Name the FP16_Optimizer instance to replace the existing optimizer + # (recommended but not required): + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + # loss.backward() becomes: + optimizer.backward(loss) + ... + + Example with dynamic loss scaling:: + + ... + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # optional arg to control dynamic loss scaling behavior + # dynamic_loss_args={'scale_window' : 500}) + # Usually, dynamic_loss_args is not necessary. + + Args: + init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. + static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. + dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. + dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. + verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. + + ``init_optimizer`` is expected to have been constructed in the ordinary way. + It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be + named to replace ``init_optimizer``, for two reasons: + First, it means that references to the same name + later in the file will not have to change. + Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to + modify ``init_optimizer``. If you do choose a unique name for the new + :class:`FP16_Optimizer` instance, you should only work with this new instance, + because the preexisting optimizer might no longer behave as expected. + + ``init_optimizer`` may be any Pytorch optimizer. + It may contain a mixture of fp16 and fp32 parameters organized into any number of + ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will + ingest these ``param_groups`` and remember them. + + Calls to :: + + loss.backward() + + must be replaced with :: + + optimizer.backward(loss) + + because :class:`FP16_Optimizer` requires ownership of the backward pass to implement + loss scaling and copies to master gradients. + + .. note:: + Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients + are downscaled before being applied. 
This means that adjusting the loss scale, or using
+ dynamic loss scaling, should not require retuning the learning rate or any other
+ hyperparameters.
+
+
+ **Advanced options**
+
+ **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure.
+ See docstring for :attr:`step`.
+
+ **Gradient clipping**: Use :attr:`clip_master_grads`.
+
+ **Multiple losses**: If your model accumulates gradients from multiple losses,
+ this can be made more efficient by supplying ``update_master_grads=False``
+ to :attr:`backward`. See docstring for :attr:`backward`.
+
+ **Manually adjusting loss scale**: The current loss scale can be retrieved or set via ::
+
+ print(optimizer.loss_scale)
+ optimizer.loss_scale = new_loss_scale
+
+ For static loss scaling, manually adjusting the loss scale over time is a reasonable
+ thing to do. During later epochs, gradients may become smaller, and a
+ higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss
+ scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting
+ the loss scale is not recommended.
+
+ **Multi-GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
+ Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
+ should still work as intended.
+ """
+
+ def __init__(self,
+ init_optimizer,
+ static_loss_scale=1.0,
+ dynamic_loss_scale=False,
+ dynamic_loss_args=None,
+ verbose=False):
+ if not torch.cuda.is_available():
+ raise SystemError("Cannot use fp16 without CUDA.")
+
+ self.verbose = verbose
+
+ self.optimizer = init_optimizer
+ # init_state_dict sets up an alternative way to cast per-param state tensors.
+ # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
+ # init_state_dict = init_optimizer.state_dict()
+
+ self.fp16_groups = []
+ self.fp32_from_fp16_groups = []
+ self.fp32_from_fp32_groups = []
+ for i, param_group in enumerate(self.optimizer.param_groups):
+ self.maybe_print("FP16_Optimizer processing param group {}:".format(i))
+ fp16_params_this_group = []
+ fp32_params_this_group = []
+ fp32_from_fp16_params_this_group = []
+ for i, param in enumerate(param_group['params']):
+ if param.requires_grad:
+ if param.type() == 'torch.cuda.HalfTensor':
+ self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
+ .format(param.size()))
+ fp16_params_this_group.append(param)
+ master_param = param.detach().clone().float()
+ master_param.requires_grad = True
+ # Copy the model parallel flag.
+ master_param.model_parallel = param.model_parallel
+ param_group['params'][i] = master_param
+ fp32_from_fp16_params_this_group.append(master_param)
+ # Reset existing state dict key to the new master param.
+ # We still need to recast per-param state tensors, if any, to FP32.
+ if param in self.optimizer.state:
+ self.optimizer.state[master_param] = self.optimizer.state.pop(param)
+ elif param.type() == 'torch.cuda.FloatTensor':
+ self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
+ .format(param.size()))
+ fp32_params_this_group.append(param)
+ param_group['params'][i] = param
+ else:
+ raise TypeError("Wrapped parameters must be either "
+ "torch.cuda.FloatTensor or torch.cuda.HalfTensor.
" + "Received {}".format(param.type())) + + self.fp16_groups.append(fp16_params_this_group) + self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors + self.optimizer.load_state_dict(self.optimizer.state_dict()) + # alternative way to cast per-param state tensors: + # self.optimizer.load_state_dict(init_state_dict) + + if dynamic_loss_scale: + self.dynamic_loss_scale = True + if dynamic_loss_args is not None: + self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + else: + self.loss_scaler = DynamicLossScaler() + else: + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(static_loss_scale) + + self.overflow = False + self.first_closure_call_this_step = True + + self.clip_grad_norm = clip_grad_norm + + def maybe_print(self, msg): + if self.verbose: + print(msg) + + def __getstate__(self): + raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") + + def __setstate__(self, state): + raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().") + + def zero_grad(self, set_grads_to_None=False): + """ + Zero fp32 and fp16 parameter grads. + """ + # In principle, only the .grad attributes of the model params need to be zeroed, + # because gradients are copied into the FP32 master params. However, we zero + # all gradients owned by the optimizer, just to be safe: + for group in self.optimizer.param_groups: + for p in group['params']: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + # Zero fp16 gradients owned by the model: + for fp16_group in self.fp16_groups: + for param in fp16_group: + if set_grads_to_None: + param.grad = None + else: + if param.grad is not None: + param.grad.detach_() # as in torch.optim.optimizer.zero_grad() + param.grad.zero_() + + def _check_overflow(self): + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + for group in self.fp32_from_fp32_groups: + for param in group: + params.append(param) + self.overflow = self.loss_scaler.has_overflow(params) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + def _master_params_to_model_params(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp16_group, fp32_from_fp16_group) + + def _model_params_to_master_params(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp32_from_fp16_group, fp16_group) + + # To consider: Integrate distributed with this wrapper by registering a hook on each variable + # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. + def _model_grads_to_master_grads(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) + + def _downscale_master(self): + if self.loss_scale != 1.0: + for group in self.optimizer.param_groups: + for param in group['params']: + if param.grad is not None: + param.grad.data.mul_(1./self.loss_scale) + + def clip_master_grads(self, max_norm, norm_type=2): + """ + Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. 
+ + Args: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the current fp32 gradients (viewed as a single vector). + + .. warning:: + Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). + """ + if not self.overflow: + fp32_params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + fp32_params.append(param) + return self.clip_grad_norm(fp32_params, max_norm, norm_type) + else: + return -1 + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. + Example:: + + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step + state_dict['optimizer_state_dict'] = self.optimizer.state_dict() + state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups + return state_dict + + def load_state_dict(self, state_dict): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + self.first_closure_call_this_step = state_dict['first_closure_call_this_step'] + self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 2. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. 
+ for current_group, saved_group in zip(self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']): + for current, saved in zip(current_group, saved_group): + current.data.copy_(saved.data) + + def step(self, closure=None): # could add clip option. + """ + If no closure is supplied, :attr:`step` should be called after + ``fp16_optimizer_obj.backward(loss)``. + :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to + :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params + originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run + another forward pass using their model. + + If a closure is supplied, :attr:`step` may be called without a prior call to + :attr:`backward(loss)`. + This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. + However, the user should take care that any ``loss.backward()`` call within the closure + has been replaced by ``fp16_optimizer_obj.backward(loss)``. + + Args: + closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. + + Example with closure:: + + # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an + # existing pytorch optimizer. + for input, target in dataset: + def closure(): + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + # loss.backward() becomes: + optimizer.backward(loss) + return loss + optimizer.step(closure) + + .. warning:: + Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. + + .. _`ordinary Pytorch optimizer use`: + http://pytorch.org/docs/master/optim.html#optimizer-step-closure + """ + + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + + if self.overflow: + self.maybe_print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}" + .format(scale, self.loss_scale)) + return + + if closure is not None: + retval = self._step_with_closure(closure) + else: + retval = self.optimizer.step() + + self._master_params_to_model_params() + + return retval + + def _step_with_closure(self, closure): + def wrapped_closure(): + # helpful for debugging + # print("Calling wrapped_closure, first_closure_call_this_step = {}" + # .format(self.first_closure_call_this_step)) + if self.first_closure_call_this_step: + # We expect that the fp16 params are initially fresh on entering self.step(), + # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() + # is called within self.optimizer.step(). + self.first_closure_call_this_step = False + else: + # If self.optimizer.step() internally calls wrapped_closure more than once, + # it may update the fp32 params after each call. However, self.optimizer + # doesn't know about the fp16 params at all. If the fp32 params get updated, + # we can't rely on self.optimizer to refresh the fp16 params. We need + # to handle that manually: + self._master_params_to_model_params() + # Our API expects the user to give us ownership of the backward() call by + # replacing all calls to loss.backward() with optimizer.backward(loss). + # This requirement holds whether or not the call to backward() is made within a closure. 
+ # If the user is properly calling optimizer.backward(loss) within "closure," + # calling closure() here will give the fp32 master params fresh gradients + # for the optimizer to play with, so all wrapped_closure needs to do is call + # closure() and return the loss. + temp_loss = closure() + while(self.overflow): + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + self.maybe_print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, " + "reducing to {}".format(scale, self.loss_scale)) + temp_loss = closure() + return temp_loss + + retval = self.optimizer.step(wrapped_closure) + + self.first_closure_call_this_step = True + + return retval + + def backward(self, loss, update_master_grads=True, retain_graph=False): + """ + :attr:`backward` performs the following conceptual steps: + + 1. fp32_loss = loss.float() (see first Note below) + 2. scaled_loss = fp32_loss*loss_scale + 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). + 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. + 5. Finally, master grads are divided by loss_scale. + + In this way, after :attr:`backward`, the master params have fresh gradients, + and :attr:`step` may be called. + + .. note:: + :attr:`backward` internally converts the loss to fp32 before applying the loss scale. + This provides some additional safety against overflow if the user has supplied an + fp16 loss value. + However, for maximum overflow safety, the user should + compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to + :attr:`backward`. + + .. warning:: + The gradients found in a model's leaves after the call to + :attr:`backward` should not be regarded as valid in general, + because it's possible + they have been scaled (and in the case of dynamic loss scaling, + the scale factor may change over time). + If the user wants to inspect gradients after a call to :attr:`backward`, + only the master gradients should be regarded as valid. These can be retrieved via + :attr:`inspect_master_grad_data()`. + + Args: + loss: The loss output by the user's model. loss may be either float or half (but see first Note above). + update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. + retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). + + Example:: + + # Ordinary operation: + optimizer.backward(loss) + + # Naive operation with multiple losses (technically valid, but less efficient): + # fp32 grads will be correct after the second call, but + # the first call incurs an unnecessary fp16->fp32 grad copy. 
+            optimizer.backward(loss1)
+            optimizer.backward(loss2)
+
+            # More efficient way to handle multiple losses:
+            # The fp16->fp32 grad copy is delayed until fp16 grads from all
+            # losses have been accumulated.
+            optimizer.backward(loss1, update_master_grads=False)
+            optimizer.backward(loss2, update_master_grads=False)
+            optimizer.update_master_grads()
+        """
+        # To consider: try multiple backward passes using retain_graph=True to find
+        # a loss scale that works. After you find a loss scale that works, do a final dummy
+        # backward pass with retain_graph=False to tear down the graph. Doing this would avoid
+        # discarding the iteration, but probably wouldn't improve overall efficiency.
+        self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
+        if update_master_grads:
+            self.update_master_grads()
+
+    def update_master_grads(self):
+        """
+        Copy the ``.grad`` attribute from stored references to fp16 parameters to
+        the ``.grad`` attribute of the fp32 master parameters that are directly
+        updated by the optimizer. :attr:`update_master_grads` only needs to be called if
+        ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
+        """
+        if self.dynamic_loss_scale:
+            self._check_overflow()
+            if self.overflow:
+                return
+        self._model_grads_to_master_grads()
+        self._downscale_master()
+
+    def inspect_master_grad_data(self):
+        """
+        When running with :class:`FP16_Optimizer`,
+        ``.grad`` attributes of a model's fp16 leaves should not be
+        regarded as truthful, because they might be scaled.
+        After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
+        the fp32 master params' ``.grad``
+        attributes will contain valid gradients properly divided by the loss scale. However,
+        because :class:`FP16_Optimizer` flattens some parameters, accessing them may be
+        nonintuitive. :attr:`inspect_master_grad_data`
+        allows those gradients to be viewed with shapes corresponding to their associated model leaves.
+
+        Returns:
+            List of lists (one list for each parameter group). The list for each parameter group
+            is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group.
+        """
+        if self.overflow:
+            print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. "
+                  "Gradients are currently invalid (may be inf, nan, or stale). Returning None.")
+            return None
+        else:
+            # The optimizer owns only references to master params.
+            master_grads_data = []
+            for param_group in self.optimizer.param_groups:
+                master_grads_this_group = []
+                for param in param_group['params']:
+                    if param.grad is not None:
+                        master_grads_this_group.append(param.grad.data)
+                    else:
+                        master_grads_this_group.append(None)
+                master_grads_data.append(master_grads_this_group)
+            return master_grads_data
+
+    # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
+    def _get_loss_scale(self):
+        return self.loss_scaler.loss_scale
+
+    def _set_loss_scale(self, value):
+        self.loss_scaler.cur_scale = value
+
+    loss_scale = property(_get_loss_scale, _set_loss_scale)
+
+    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
+    def _get_state(self):
+        return self.optimizer.state
+
+    def _set_state(self, value):
+        self.optimizer.state = value
+
+    state = property(_get_state, _set_state)
+
+    # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
+    # (for example, to adjust the learning rate)
+    def _get_param_groups(self):
+        return self.optimizer.param_groups
+
+    def _set_param_groups(self, value):
+        self.optimizer.param_groups = value
+
+    param_groups = property(_get_param_groups, _set_param_groups)
diff --git a/modelscope/utils/nlp/fp16/fp16util.py b/modelscope/utils/nlp/fp16/fp16util.py
new file mode 100755
index 00000000..08d13733
--- /dev/null
+++ b/modelscope/utils/nlp/fp16/fp16util.py
@@ -0,0 +1,204 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+# This module lives in modelscope, so import mpu from the local package
+# rather than from the original `sofa` code base.
+from modelscope.utils.nlp import mpu
+
+
+class tofp16(nn.Module):
+    """
+    Utility module that implements::
+
+        def forward(self, input):
+            return input.half()
+    """
+
+    def __init__(self):
+        super(tofp16, self).__init__()
+
+    def forward(self, input):
+        return input.half()
+
+
+def BN_convert_float(module):
+    """
+    Utility function for network_to_half().
+
+    Retained for legacy purposes.
+    """
+    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
+        module.float()
+    for child in module.children():
+        BN_convert_float(child)
+    return module
+
+
+def network_to_half(network):
+    """
+    Convert model to half precision in a batchnorm-safe way.
+
+    Retained for legacy purposes. It is recommended to use FP16Model.
+    """
+    return nn.Sequential(tofp16(), BN_convert_float(network.half()))
+
+
+def convert_module(module, dtype):
+    """
+    Converts a module's immediate parameters and buffers to dtype.
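+
+    Example (illustrative sketch; ``net`` is an assumed stand-in for any
+    module you have built)::
+
+        import torch.nn as nn
+        net = nn.Linear(4, 4)
+        convert_module(net, torch.half)   # immediate params/buffers become fp16
+        convert_module(net, torch.float)  # and can be converted back to fp32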
+ """ + for param in module.parameters(recurse=False): + if param is not None: + if param.data.dtype.is_floating_point: + param.data = param.data.to(dtype=dtype) + if param._grad is not None and param._grad.data.dtype.is_floating_point: + param._grad.data = param._grad.data.to(dtype=dtype) + + for buf in module.buffers(recurse=False): + if buf is not None and buf.data.dtype.is_floating_point: + buf.data = buf.data.to(dtype=dtype) + + +def convert_network(network, dtype): + """ + Converts a network's parameters and buffers to dtype. + """ + for module in network.modules(): + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + continue + convert_module(module, dtype) + return network + + +class FP16Model(nn.Module): + """ + Convert model to half precision in a batchnorm-safe way. + """ + + def __init__(self, network): + super(FP16Model, self).__init__() + self.network = convert_network(network, dtype=torch.half) + + def forward(self, *inputs): + inputs = tuple(t.half() for t in inputs) + return self.network(*inputs) + + +def backwards_debug_hook(grad): + raise RuntimeError("master_params recieved a gradient in the backward pass!") + +def prep_param_lists(model, flat_master=False): + """ + Creates a list of FP32 master parameters for a given model, as in + `Training Neural Networks with Mixed Precision: Real Examples`_. + + Args: + model (torch.nn.Module): Existing Pytorch model + flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. + Returns: + A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. + + Example:: + + model_params, master_params = prep_param_lists(model) + + .. warning:: + Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. + + .. _`Training Neural Networks with Mixed Precision: Real Examples`: + http://on-demand.gputechconf.com/gtc/2018/video/S81012/ + """ + model_params = [param for param in model.parameters() if param.requires_grad] + + if flat_master: + # Give the user some more useful error messages + try: + # flatten_dense_tensors returns a contiguous flat array. + # http://pytorch.org/docs/master/_modules/torch/_utils.html + master_params = _flatten_dense_tensors([param.data for param in model_params]).float() + except: + print("Error in prep_param_lists: model may contain a mixture of parameters " + "of different types. Use flat_master=False, or use F16_Optimizer.") + raise + master_params = torch.nn.Parameter(master_params) + master_params.requires_grad = True + # master_params.register_hook(backwards_debug_hook) + if master_params.grad is None: + master_params.grad = master_params.new(*master_params.size()) + return model_params, [master_params] + else: + master_params = [param.clone().float().detach() for param in model_params] + for param in master_params: + param.requires_grad = True + return model_params, master_params + + +def model_grads_to_master_grads(model_params, master_params, flat_master=False): + """ + Copy model gradients to master gradients. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. 
+        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
+    """
+    if flat_master:
+        # The flattening may incur one more deep copy than is necessary.
+        master_params[0].grad.data.copy_(
+            _flatten_dense_tensors([p.grad.data for p in model_params]))
+    else:
+        for model, master in zip(model_params, master_params):
+            if model.grad is not None:
+                if master.grad is None:
+                    master.grad = Variable(master.data.new(*master.data.size()))
+                master.grad.data.copy_(model.grad.data)
+            else:
+                master.grad = None
+
+
+def master_params_to_model_params(model_params, master_params, flat_master=False):
+    """
+    Copy master parameters to model parameters.
+
+    Args:
+        model_params: List of model parameters created by :func:`prep_param_lists`.
+        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
+    """
+    if flat_master:
+        for model, master in zip(model_params,
+                                 _unflatten_dense_tensors(master_params[0].data, model_params)):
+            model.data.copy_(master)
+    else:
+        for model, master in zip(model_params, master_params):
+            model.data.copy_(master.data)
+
+
+# Backward compatibility fixes
+
+def to_python_float(t):
+    if hasattr(t, 'item'):
+        return t.item()
+    else:
+        return t[0]
+
+
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+# Model parallel parameters must be clipped with mpu.clip_grad_norm, which
+# also reduces the norm across model parallel ranks. (For non-model-parallel
+# code the plain torch alternatives would be torch.nn.utils.clip_grad_norm
+# on torch <= 0.4 and torch.nn.utils.clip_grad_norm_ on newer versions.)
+clip_grad_norm = mpu.clip_grad_norm
diff --git a/modelscope/utils/nlp/fp16/loss_scaler.py b/modelscope/utils/nlp/fp16/loss_scaler.py
new file mode 100755
index 00000000..e92d8c03
--- /dev/null
+++ b/modelscope/utils/nlp/fp16/loss_scaler.py
@@ -0,0 +1,237 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+# This module lives in modelscope, so import mpu from the local package
+# rather than from the original `sofa` code base.
+from modelscope.utils.nlp import mpu
+
+
+# item() is a recent addition, so this helps with backward compatibility.
+def to_python_float(t):
+    if hasattr(t, 'item'):
+        return t.item()
+    else:
+        return t[0]
+
+
+class LossScaler:
+    """
+    Class that manages a static loss scale. This class is intended to interact with
+    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.
+
+    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
+    :class:`FP16_Optimizer`'s constructor.
+
+    Args:
+        scale (float, optional, default=1.0): The loss scale.
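+
+    Example (illustrative sketch of manual use; ``loss`` is assumed to come
+    from a forward pass, and normal usage goes through :class:`FP16_Optimizer`
+    rather than calling this directly)::
+
+        loss_scaler = LossScaler(scale=128.0)
+        loss_scaler.backward(loss)  # backprops loss * 128.0
+        # the resulting gradients must later be divided by loss_scaler.loss_scale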
+ """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss*self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
+ """ + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): + return True + + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + overflow_gpu = torch.cuda.ByteTensor([overflow]) + torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) + overflow = overflow_gpu[0].item() + return bool(overflow) + + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if "value cannot be converted" not in instance.args[0]: + raise + return True + else: + if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether the gradient overflowed + def update_scale(self, overflow): + + if not hasattr(self, 'min_scale'): + self.min_scale = 1 + if not hasattr(self, 'delayed_shift'): + self.delayed_shift = 1 + if not hasattr(self, 'cur_hysteresis'): + self.cur_hysteresis = 1 + if not hasattr(self, 'consecutive_hysteresis'): + self.consecutive_hysteresis = True + if overflow: + # self.cur_scale /= self.scale_factor + if self.delayed_shift == 1 or self.cur_hysteresis == 1: + self.cur_scale = max(self.cur_scale/self.scale_factor, self.min_scale) + else: + self.cur_hysteresis -= 1 + self.last_overflow_iter = self.cur_iter + else: + if self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: + if not self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss*self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +""" +TO-DO separate out into an example. 
+if __name__ == "__main__":
+    import torch
+    from torch.autograd import Variable
+    from dynamic_loss_scaler import DynamicLossScaler
+
+    # N is batch size; D_in is input dimension;
+    # H is hidden dimension; D_out is output dimension.
+    N, D_in, H, D_out = 64, 1000, 100, 10
+
+    # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
+    x = Variable(torch.randn(N, D_in), requires_grad=False)
+    y = Variable(torch.randn(N, D_out), requires_grad=False)
+
+    w1 = Variable(torch.randn(D_in, H), requires_grad=True)
+    w2 = Variable(torch.randn(H, D_out), requires_grad=True)
+    parameters = [w1, w2]
+
+    learning_rate = 1e-6
+    optimizer = torch.optim.SGD(parameters, lr=learning_rate)
+    loss_scaler = DynamicLossScaler()
+
+    for t in range(500):
+        y_pred = x.mm(w1).clamp(min=0).mm(w2)
+        loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
+        print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
+        print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
+        print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
+
+        # Run backprop
+        optimizer.zero_grad()
+        loss.backward()
+
+        # Check for overflow
+        has_overflow = loss_scaler.has_overflow(parameters)
+
+        # If no overflow, unscale grad and update as usual
+        if not has_overflow:
+            for param in parameters:
+                param.grad.data.mul_(1. / loss_scaler.loss_scale)
+            optimizer.step()
+        # Otherwise, don't do anything -- i.e., skip the iteration
+        else:
+            print('OVERFLOW!')
+
+        # Update loss scale for next iteration
+        loss_scaler.update_scale(has_overflow)
+
+"""
diff --git a/modelscope/utils/nlp/load_checkpoint.py b/modelscope/utils/nlp/load_checkpoint.py
new file mode 100755
index 00000000..e92e0974
--- /dev/null
+++ b/modelscope/utils/nlp/load_checkpoint.py
@@ -0,0 +1,102 @@
+import os
+
+import torch
+
+
+def load_checkpoint(model,
+                    load_dir,
+                    tag,
+                    load_module_strict=True,
+                    load_optimizer_states=True,
+                    load_lr_scheduler_states=True):
+    r"""Load a training checkpoint.
+
+    Arguments:
+        model: Required. The (deepspeed-style) model engine to restore into.
+        load_dir: Required. Directory to load the checkpoint from.
+        tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step.
+        load_module_strict: Optional. Boolean to strictly enforce that the keys in the state_dict of the module and the checkpoint match.
+        load_optimizer_states: Optional. Boolean to load the training optimizer states from the checkpoint. Ex. ADAM's momentum and variance.
+        load_lr_scheduler_states: Optional. Boolean to load the learning rate scheduler states from the checkpoint.
+    Return:
+        load_path: Path of the loaded checkpoint. None if loading the checkpoint failed.
+        client_state: State dictionary used for loading required training states in the client code.
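+
+    Example (hypothetical sketch; ``engine`` stands for a deepspeed-style model
+    engine, and the directory and tag below are placeholder names)::
+
+        load_path, client_state = load_checkpoint(engine, './checkpoints',
+                                                  'global_step1000')
+        if load_path is None:
+            print('no checkpoint found for the requested tag')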
+ """ + + load_path, client_states = _load_checkpoint(model, + load_dir, + tag, + load_module_strict=load_module_strict, + load_optimizer_states=load_optimizer_states, + load_lr_scheduler_states=load_lr_scheduler_states) + + if load_optimizer_states: + if model.zero_optimization() and load_path is not None: + model._load_zero_checkpoint(load_dir, + tag, + load_optimizer_states=load_optimizer_states) + + return load_path, client_states + +def _get_ckpt_name(mpu, checkpoints_path, tag): + mp_rank = 0 if mpu is None else mpu.get_model_parallel_rank() + ckpt_name = os.path.join(checkpoints_path, + str(tag), + 'mp_rank_{:02d}'.format(mp_rank) + '_model_states.pt') + return ckpt_name + +def pre_load(mpu, + load_dir, + tag=''): + load_path = _get_ckpt_name(mpu, load_dir, tag) + checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage) + return checkpoint['module'] + +def _load_checkpoint(model, + load_dir, + tag, + load_module_strict=True, + load_optimizer_states=True, + load_lr_scheduler_states=True): + + load_path = model._get_ckpt_name(load_dir, tag) + + if not os.path.exists(load_path): + return None, None + + checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage) + + model.load_module_state_dict(state_dict=checkpoint['module'], + strict=load_module_strict) + if not model.zero_optimization() and load_optimizer_states: + if model.fp16_enabled(): + model.optimizer.load_state_dict( + checkpoint['optimizer'], + load_optimizer_states=load_optimizer_states) + elif load_optimizer_states: + model.optimizer.load_state_dict(checkpoint['optimizer']) + + if load_lr_scheduler_states and model.lr_scheduler is not None: + model.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + + model.csr_tensor_module_names = checkpoint['csr_tensor_module_names'] + model.global_steps = checkpoint['global_steps'] + model.global_samples = checkpoint.get('global_samples', + model.global_steps * model.train_batch_size()) + model.skipped_steps = checkpoint['skipped_steps'] + model.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size'] + model.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size'] + deepspeed_states = [ + 'module', + 'optimizer', + 'lr_scheduler', + 'csr_tensor_module_names', + 'skipped_steps', + 'global_steps', + 'dp_world_size', + 'mp_world_size' + ] + client_state = { + key: value + for key, + value in checkpoint.items() if not key in deepspeed_states + } + + return load_path, client_state diff --git a/modelscope/utils/nlp/mpu/__init__.py b/modelscope/utils/nlp/mpu/__init__.py new file mode 100755 index 00000000..e150e0c7 --- /dev/null +++ b/modelscope/utils/nlp/mpu/__init__.py @@ -0,0 +1,55 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Model parallel utility interface.""" + +from .cross_entropy import vocab_parallel_cross_entropy + +from .data import broadcast_data + +from .grads import clip_grad_norm + +from .initialize import destroy_model_parallel +from .initialize import get_data_parallel_group +from .initialize import get_data_parallel_rank +from .initialize import get_data_parallel_world_size +from .initialize import get_model_parallel_group +from .initialize import get_model_parallel_rank +from .initialize import get_model_parallel_src_rank +from .initialize import get_model_parallel_world_size +from .initialize import initialize_model_parallel +from .initialize import model_parallel_is_initialized + +from .layers import ColumnParallelLinear +from .layers import ParallelEmbedding +from .layers import RowParallelLinear +from .layers import VocabParallelEmbedding + +from .mappings import copy_to_model_parallel_region +from .mappings import gather_from_model_parallel_region +from .mappings import reduce_from_model_parallel_region +from .mappings import scatter_to_model_parallel_region +from .mappings import _gather + +from .random import checkpoint +from .random import partition_activations_in_checkpoint +from .random import get_cuda_rng_tracker +from .random import model_parallel_cuda_manual_seed + +from .transformer import BertParallelSelfAttention +from .transformer import BertParallelTransformerLayer +from .transformer import GPT2ParallelTransformer, GPT2ParallelSelfAttention +from .transformer import PalmParallelCrossAttention +from .transformer import LayerNorm diff --git a/modelscope/utils/nlp/mpu/binarizer.py b/modelscope/utils/nlp/mpu/binarizer.py new file mode 100644 index 00000000..2bdebd1b --- /dev/null +++ b/modelscope/utils/nlp/mpu/binarizer.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2020-present, AllenAI Authors, University of Illinois Urbana-Champaign, +# Intel Nervana Systems and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Binarizers take a (real value) matrix as input and produce a binary (values in {0,1}) mask of the same shape. +""" + +import torch +from torch import autograd + + +class ThresholdBinarizer(autograd.Function): + """ + Thresholdd binarizer. + Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j} > \tau` + where `\tau` is a real value threshold. + + Implementation is inspired from: + https://github.com/arunmallya/piggyback + Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights + Arun Mallya, Dillon Davis, Svetlana Lazebnik + """ + + @staticmethod + def forward(ctx, inputs: torch.tensor, threshold: float, sigmoid: bool): + """ + Args: + inputs (`torch.FloatTensor`) + The input matrix from which the binarizer computes the binary mask. + threshold (`float`) + The threshold value (in R). + sigmoid (`bool`) + If set to ``True``, we apply the sigmoid function to the `inputs` matrix before comparing to `threshold`. 
+ In this case, `threshold` should be a value between 0 and 1. + Returns: + mask (`torch.FloatTensor`) + Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is + retained, 0 - the associated weight is pruned). + """ + nb_elems = inputs.numel() + nb_min = int(0.005 * nb_elems) + 1 + if sigmoid: + mask = (torch.sigmoid(inputs) > threshold).type(inputs.type()).bool() + else: + mask = (inputs > threshold).type(inputs.type()).bool() + if mask.sum() < nb_min: + # We limit the pruning so that at least 0.5% (half a percent) of the weights are remaining + k_threshold = inputs.flatten().kthvalue(max(nb_elems - nb_min, 1)).values + mask = (inputs > k_threshold).type(inputs.type()).bool() + return mask + + @staticmethod + def backward(ctx, gradOutput): + return gradOutput, None, None + + +class TopKBinarizer(autograd.Function): + """ + Top-k Binarizer. + Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}` + is among the k% highest values of S. + + Implementation is inspired from: + https://github.com/allenai/hidden-networks + What's hidden in a randomly weighted neural network? + Vivek Ramanujan*, Mitchell Wortsman*, Aniruddha Kembhavi, Ali Farhadi, Mohammad Rastegari + """ + + @staticmethod + def forward(ctx, inputs: torch.tensor, threshold: float, k_threshold=None): + """ + Args: + inputs (`torch.FloatTensor`) + The input matrix from which the binarizer computes the binary mask. + threshold (`float`) + The percentage of weights to keep (the rest is pruned). + `threshold` is a float between 0 and 1. + Returns: + mask (`torch.FloatTensor`) + Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is + retained, 0 - the associated weight is pruned). + """ + # Get the subnetwork by sorting the inputs and using the top threshold % + if k_threshold is None: + mask = inputs.clone() + _, idx = inputs.flatten().sort(descending=True) + j = int(threshold * inputs.numel()) + # flat_out and mask access the same memory. + flat_out = mask.flatten() + flat_out[idx[j:]] = 0 + flat_out[idx[:j]] = 1 + + # if threshold == 1: + # k_threshold = -1000 + # else: + # n = inputs.numel() + # kth = min(max(n - (int(n * threshold) + 1), 1), n) + # k_threshold = inputs.flatten().kthvalue(kth).values + # mask = (inputs > k_threshold).type(inputs.type()) + else: + if threshold == 1.0: + mask = (inputs > -1000).type(inputs.type()) + else: + mask = (inputs > k_threshold).type(inputs.type()) + + # # Get the subnetwork by get the kthvalue + # # ==> This method will cause bug since if all the mask_scores are the same, the mask is all zero. 
+        # n = inputs.numel()
+        # kth = max(n - (int(n * threshold) + 1), 1)
+        # k_threshold = inputs.flatten().kthvalue(kth).values
+        # mask = (inputs > k_threshold).type(inputs.type())
+
+        return mask
+
+    @staticmethod
+    def backward(ctx, gradOutput):
+        return gradOutput, None, None
+
+
+class MagnitudeBinarizer(object):
+    """
+    Magnitude Binarizer.
+    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
+    is among the k% highest values of |S| (absolute value).
+
+    Implementation is inspired by https://github.com/NervanaSystems/distiller/blob/2291fdcc2ea642a98d4e20629acb5a9e2e04b4e6/distiller/pruning/automated_gradual_pruner.py#L24
+    """
+
+    @staticmethod
+    def apply(inputs: torch.Tensor, threshold: float):
+        """
+        Args:
+            inputs (`torch.FloatTensor`)
+                The input matrix from which the binarizer computes the binary mask.
+                This input matrix is typically the weight matrix.
+            threshold (`float`)
+                The percentage of weights to keep (the rest is pruned).
+                `threshold` is a float between 0 and 1.
+        Returns:
+            mask (`torch.FloatTensor`)
+                Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
+                retained, 0 - the associated weight is pruned).
+        """
+        # Get the subnetwork by sorting the inputs and using the top threshold %
+        mask = inputs.clone()
+        _, idx = inputs.abs().flatten().sort(descending=True)
+        j = int(threshold * inputs.numel())
+        # flat_out and mask access the same memory.
+        flat_out = mask.flatten()
+        flat_out[idx[j:]] = 0
+        flat_out[idx[:j]] = 1
+        # mask = mask.bool()
+        return mask
+
+        # # Alternative: get the subnetwork via kthvalue
+        # # ==> This method will cause a bug: if all the mask_scores are the same, the mask is all zero.
+        # n = inputs.numel()
+        # kth = max(n - (int(n * threshold) + 1), 1)
+        # k_threshold = inputs.abs().flatten().kthvalue(kth).values
+        # mask = (inputs > k_threshold).type(inputs.type())
+        # return mask
+
+
+class MaskTaylor(autograd.Function):
+    # Straight-through weight masking. Note that the backward pass replaces the
+    # true mask gradient with a negative squared first-order (Taylor) importance
+    # score rather than the analytic gradient.
+    @staticmethod
+    def forward(ctx, weight, mask):
+        ctx.save_for_backward(weight, mask)
+        return mask * weight
+
+    @staticmethod
+    def backward(ctx, gradOutput):
+        weight, mask = ctx.saved_tensors
+        return gradOutput * mask, -torch.pow(gradOutput * weight, 2)
+        # return gradOutput * mask, -torch.abs(gradOutput * weight)
diff --git a/modelscope/utils/nlp/mpu/cross_entropy.py b/modelscope/utils/nlp/mpu/cross_entropy.py
new file mode 100755
index 00000000..845f0441
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/cross_entropy.py
@@ -0,0 +1,109 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+
+from .initialize import get_model_parallel_group
+from .initialize import get_model_parallel_rank
+from .initialize import get_model_parallel_world_size
+from .utils import VocabUtility
+
+
+class _VocabParallelCrossEntropy(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, vocab_parallel_logits, target):
+
+        # Copy so the input remains unchanged.
+        logits = vocab_parallel_logits.clone()
+        # Maximum value along vocab dimension across all GPUs.
+        logits_max = torch.max(logits, dim=-1)[0]
+        torch.distributed.all_reduce(logits_max,
+                                     op=torch.distributed.ReduceOp.MAX,
+                                     group=get_model_parallel_group())
+        # Subtract the maximum value.
+        logits.sub_(logits_max.unsqueeze(dim=-1))
+        # Sum of exponential of logits along vocab dimension across all GPUs.
+        exp_logits = logits.exp()
+        sum_exp_logits = exp_logits.sum(dim=-1)
+        torch.distributed.all_reduce(sum_exp_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=get_model_parallel_group())
+
+        # Get the partition's vocab indices
+        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
+        partition_vocab_size = vocab_parallel_logits.size()[-1]
+        rank = get_model_parallel_rank()
+        world_size = get_model_parallel_world_size()
+        vocab_start_index, vocab_end_index = get_vocab_range(
+            partition_vocab_size, rank, world_size)
+
+        # Create a mask of valid vocab ids (1 means it needs to be masked).
+        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
+        masked_target = target.clone() - vocab_start_index
+        masked_target[target_mask] = 0
+
+        # Get predicted-logits = logits[target].
+        # For simplicity, we convert logits to a 2-D tensor with size
+        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
+        logits_2d = logits.view(-1, partition_vocab_size)
+        masked_target_1d = masked_target.view(-1)
+        arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
+                                 device=logits_2d.device)
+        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
+        predicted_logits = predicted_logits_1d.view_as(target)
+        predicted_logits[target_mask] = 0.0
+        # All reduce is needed to get the chunks from other GPUs.
+        torch.distributed.all_reduce(predicted_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=get_model_parallel_group())
+
+        # Loss = log(sum(exp(logits))) - predicted-logit.
+        loss = torch.log(sum_exp_logits) - predicted_logits
+
+        # Store softmax, target-mask and masked-target for backward pass.
+        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
+        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
+
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+
+        # Retrieve tensors from the forward path.
+        softmax, target_mask, masked_target_1d = ctx.saved_tensors
+
+        # All the inputs have softmax as their gradient.
+        grad_input = softmax
+        # For simplicity, work with the 2D gradient.
+        partition_vocab_size = softmax.size()[-1]
+        grad_2d = grad_input.view(-1, partition_vocab_size)
+
+        # Add the gradient from matching classes.
+        arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
+                                 device=grad_2d.device)
+        grad_2d[arange_1d, masked_target_1d] -= (
+            1.0 - target_mask.view(-1).float())
+
+        # Finally elementwise multiplication with the output gradients.
+        grad_input.mul_(grad_output.unsqueeze(dim=-1))
+
+        return grad_input, None
+
+
+def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
+    """Helper function for the cross entropy."""
+    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
diff --git a/modelscope/utils/nlp/mpu/data.py b/modelscope/utils/nlp/mpu/data.py
new file mode 100755
index 00000000..5008a3a1
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/data.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from .initialize import get_model_parallel_group
+from .initialize import get_model_parallel_rank
+from .initialize import get_model_parallel_src_rank
+
+
+_MAX_DATA_DIM = 5
+
+
+def _check_data_types(keys, data, target_dtype):
+    """Check that all the keys have the same target data type."""
+    for key in keys:
+        assert data[key].dtype == target_dtype, '{} has data type {} which '\
+            'is different than {}'.format(key, data[key].dtype, target_dtype)
+
+
+def _build_key_size_numel_dictionaries(keys, data):
+    """Build the size on rank 0 and broadcast."""
+    max_dim = _MAX_DATA_DIM
+    sizes = [0 for _ in range(max_dim) for _ in keys]
+
+    # Pack the sizes on rank zero.
+    if get_model_parallel_rank() == 0:
+        offset = 0
+        for key in keys:
+            assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
+            size = data[key].size()
+            for i, s in enumerate(size):
+                sizes[i + offset] = s
+            offset += max_dim
+
+    # Move to GPU and broadcast.
+    sizes_cuda = torch.cuda.LongTensor(sizes)
+    torch.distributed.broadcast(sizes_cuda, get_model_parallel_src_rank(),
+                                group=get_model_parallel_group())
+
+    # Move back to cpu and unpack.
+    sizes_cpu = sizes_cuda.cpu()
+    key_size = {}
+    key_numel = {}
+    total_numel = 0
+    offset = 0
+    for key in keys:
+        i = 0
+        size = []
+        numel = 1
+        while sizes_cpu[offset + i] > 0:
+            this_size = sizes_cpu[offset + i]
+            size.append(this_size)
+            numel *= this_size
+            i += 1
+        key_size[key] = size
+        key_numel[key] = numel
+        total_numel += numel
+        offset += max_dim
+
+    return key_size, key_numel, total_numel
+
+
+def broadcast_data(keys, data, datatype):
+    """Broadcast data from rank zero of each model parallel group to the
+    members of the same model parallel group.
+
+    Arguments:
+        keys: list of keys in the data dictionary to be broadcast
+        data: data dictionary of string keys and cpu tensor values.
+        datatype: torch data type of all tensors in data associated
+                  with keys.
+    """
+    # Build (key, size) and (key, number of elements) dictionaries along
+    # with the total number of elements on all ranks.
+    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
+                                                                          data)
+
+    # Pack on rank zero.
+    if get_model_parallel_rank() == 0:
+        # Check that all keys have the same data type.
+        _check_data_types(keys, data, datatype)
+        # Flatten the data associated with the keys.
+        flatten_data = torch.cat(
+            [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
+    else:
+        flatten_data = torch.empty(total_numel,
+                                   device=torch.cuda.current_device(),
+                                   dtype=datatype)
+
+    # Broadcast
+    torch.distributed.broadcast(flatten_data, get_model_parallel_src_rank(),
+                                group=get_model_parallel_group())
+
+    # Unpack
+    output = {}
+    offset = 0
+    for key in keys:
+        size = key_size[key]
+        numel = key_numel[key]
+        output[key] = flatten_data.narrow(0, offset, numel).view(size)
+        offset += numel
+
+    return output
diff --git a/modelscope/utils/nlp/mpu/grads.py b/modelscope/utils/nlp/mpu/grads.py
new file mode 100755
index 00000000..0ae9cf5d
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/grads.py
@@ -0,0 +1,74 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+
+import torch
+try:
+    from torch._six import inf
+except ImportError:  # torch >= 1.13 no longer ships torch._six
+    from math import inf
+
+from .initialize import get_model_parallel_group
+from .initialize import get_model_parallel_rank
+
+
+def clip_grad_norm(parameters, max_norm, norm_type=2):
+    """Clips gradient norm of an iterable of parameters.
+
+    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ with
+    added functionality to handle model parallel parameters. Note that
+    the gradients are modified in place.
+
+    Arguments:
+        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+            single Tensor that will have gradients normalized
+        max_norm (float or int): max norm of the gradients
+        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+            infinity norm.
+
+    Returns:
+        Total norm of the parameters (viewed as a single vector).
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = list(filter(lambda p: p.grad is not None, parameters))
+    max_norm = float(max_norm)
+    norm_type = float(norm_type)
+    if norm_type == inf:
+        total_norm = max(p.grad.data.abs().max() for p in parameters)
+        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
+        # Take max across all GPUs.
+        torch.distributed.all_reduce(total_norm_cuda,
+                                     op=torch.distributed.ReduceOp.MAX,
+                                     group=get_model_parallel_group())
+        total_norm = total_norm_cuda[0].item()
+    else:
+        total_norm = 0
+        for p in parameters:
+            if p.model_parallel or (get_model_parallel_rank() == 0):
+                param_norm = p.grad.data.norm(norm_type)
+                total_norm += param_norm.item() ** norm_type
+        # Sum across all model parallel GPUs.
+        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
+        torch.distributed.all_reduce(total_norm_cuda,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=get_model_parallel_group())
+        total_norm = total_norm_cuda[0].item() ** (1. / norm_type)
+    clip_coef = max_norm / (total_norm + 1e-6)
+    if clip_coef < 1:
+        for p in parameters:
+            p.grad.data.mul_(clip_coef)
+    return total_norm
diff --git a/modelscope/utils/nlp/mpu/initialize.py b/modelscope/utils/nlp/mpu/initialize.py
new file mode 100755
index 00000000..15656065
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/initialize.py
@@ -0,0 +1,134 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Model and data parallel groups."""
+
+import torch
+
+from .utils import ensure_divisibility
+
+
+# Model parallel group that the current rank belongs to.
+_MODEL_PARALLEL_GROUP = None
+# Data parallel group that the current rank belongs to.
+_DATA_PARALLEL_GROUP = None
+
+
+def initialize_model_parallel(model_parallel_size_):
+    """
+    Initialize model and data parallel groups.
+
+    Arguments:
+        model_parallel_size_: number of GPUs used to parallelize the model.
+
+    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
+    use 2 GPUs to parallelize the model. The present function will
+    create 4 model parallel groups and 2 data parallel groups as:
+        4 model parallel groups:
+            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
+        2 data parallel groups:
+            [g0, g2, g4, g6], [g1, g3, g5, g7]
+    Note that for efficiency, the caller should make sure adjacent ranks
+    are on the same DGX box. For example if we are using 2 DGX-1 boxes
+    with a total of 16 GPUs, ranks 0 to 7 belong to the first box and
+    ranks 8 to 15 belong to the second box.
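+
+    Example (illustrative sketch; assumes ``torch.distributed`` has already
+    been initialized, e.g. via ``torch.distributed.init_process_group``)::
+
+        initialize_model_parallel(2)
+        mp_rank = get_model_parallel_rank()       # rank within the 2-GPU group
+        dp_size = get_data_parallel_world_size()  # number of model replicas
+        destroy_model_parallel()                  # tear down when done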
+ """ + if torch.distributed.get_rank() == 0: + print('> initializing model parallel with size {}'.format( + model_parallel_size_)) + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size = torch.distributed.get_world_size() + model_parallel_size = min(model_parallel_size_, world_size) + ensure_divisibility(world_size, model_parallel_size) + rank = torch.distributed.get_rank() + # Build the data parallel groups. + global _DATA_PARALLEL_GROUP + assert _DATA_PARALLEL_GROUP is None, \ + 'data parallel group is already initialized' + for i in range(model_parallel_size): + ranks = range(i, world_size, model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank % model_parallel_size): + _DATA_PARALLEL_GROUP = group + + # Build the model parallel groups. + global _MODEL_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is None, \ + 'model parallel group is already initialized' + for i in range(world_size // model_parallel_size): + ranks = range(i * model_parallel_size, + (i + 1) * model_parallel_size) + group = torch.distributed.new_group(ranks) + if i == (rank // model_parallel_size): + _MODEL_PARALLEL_GROUP = group + + +def model_parallel_is_initialized(): + """Check if model and data parallel groups are initialized.""" + if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: + return False + return True + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + assert _MODEL_PARALLEL_GROUP is not None, \ + 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, \ + 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_model_parallel_world_size(): + """Return world size for the model parallel group.""" + return torch.distributed.get_world_size(group=get_model_parallel_group()) + + +def get_model_parallel_rank(): + """Return my rank for the model parallel group.""" + return torch.distributed.get_rank(group=get_model_parallel_group()) + + +def get_model_parallel_src_rank(): + """Calculate the global rank corresponding to a local rank zeor + in the model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return torch.distributed.get_world_size(group=get_data_parallel_group()) + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return torch.distributed.get_rank(group=get_data_parallel_group()) + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None diff --git a/modelscope/utils/nlp/mpu/layers.py b/modelscope/utils/nlp/mpu/layers.py new file mode 100755 index 00000000..33247553 --- /dev/null +++ b/modelscope/utils/nlp/mpu/layers.py @@ -0,0 +1,420 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+
+import math
+
+import torch
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch.nn.parameter import Parameter
+
+from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
+
+from .initialize import get_model_parallel_rank
+from .initialize import get_model_parallel_world_size
+from .mappings import copy_to_model_parallel_region
+from .mappings import gather_from_model_parallel_region
+from .mappings import reduce_from_model_parallel_region
+from .mappings import scatter_to_model_parallel_region
+from .random import get_cuda_rng_tracker
+from .utils import divide
+from .utils import split_tensor_along_last_dim
+from .utils import VocabUtility
+from deepspeed.utils.timer import SynchronizedWallClockTimer
+
+from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer, MaskTaylor
+
+
+def _initialize_affine_weight(weight, output_size, input_size,
+                              per_partition_size, partition_dim, init_method,
+                              stride=1, return_master_weight=False):
+    """Initialize affine weight for model parallel.
+
+    Build the master weight on all processes and scatter
+    the relevant chunk."""
+    # If we only use 1 process for model parallelism, bypass scatter.
+    world_size = get_model_parallel_world_size()
+    if world_size == 1:
+        init_method(weight)
+        if return_master_weight:
+            return weight
+        return None
+
+    # Initialize master weight
+    master_weight = torch.empty(output_size, input_size,
+                                dtype=weight.dtype,
+                                requires_grad=False)
+    init_method(master_weight)
+
+    # Split and copy
+    per_partition_per_stride_size = divide(per_partition_size, stride)
+    weight_list = torch.split(master_weight, per_partition_per_stride_size,
+                              dim=partition_dim)
+    rank = get_model_parallel_rank()
+    my_weight_list = weight_list[rank::world_size]
+
+    with torch.no_grad():
+        torch.cat(my_weight_list, dim=partition_dim, out=weight)
+    if return_master_weight:
+        return master_weight
+    return None
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        init_method: method to initialize weights.
+    """
+    def __init__(self, num_embeddings, embedding_dim,
+                 init_method=init.xavier_normal_):
+        super(VocabParallelEmbedding, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # Set the defaults for compatibility.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        # Divide the weight matrix along the vocabulary dimension.
+        self.vocab_start_index, self.vocab_end_index = \
+            VocabUtility.vocab_range_from_global_vocab_size(
+                self.num_embeddings, get_model_parallel_rank(),
+                get_model_parallel_world_size())
+        self.num_embeddings_per_partition = self.vocab_end_index - \
+            self.vocab_start_index
+
+        # Allocate weights.
+        self.weight = Parameter(torch.Tensor(self.num_embeddings_per_partition,
+                                             self.embedding_dim))
+        self.weight.model_parallel = True
+        # And initialize.
+        _initialize_affine_weight(
+            self.weight, self.num_embeddings, self.embedding_dim,
+            self.num_embeddings_per_partition, 0, init_method)
+        self.timers = SynchronizedWallClockTimer()
+
+    def forward(self, input_):
+        # self.timers('embedding').start()
+        # Build the mask.
+        input_mask = (input_ < self.vocab_start_index) | \
+                     (input_ >= self.vocab_end_index)
+        # Mask the input.
+        masked_input = input_.clone() - self.vocab_start_index
+        masked_input[input_mask] = 0
+        # Get the embeddings.
+        output_parallel = F.embedding(masked_input, self.weight,
+                                      self.padding_idx, self.max_norm,
+                                      self.norm_type, self.scale_grad_by_freq,
+                                      self.sparse)
+        # Mask the output embedding.
+        output_parallel[input_mask, :] = 0.0
+        # Reduce across all the model parallel GPUs.
+        # self.timers('embedding').stop()
+        # self.timers('embedding reduce').start()
+        output = reduce_from_model_parallel_region(output_parallel)
+        # self.timers('embedding reduce').stop()
+        # timer_names = ['embedding', 'embedding reduce']
+        # self.timers.log(names=timer_names)
+        return output
+
+
+class ParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the embedding dimension.
+
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        init_method: method to initialize weights.
+    """
+    def __init__(self, num_embeddings, embedding_dim,
+                 init_method=init.xavier_normal_,
+                 keep_master_weight_for_test=False):
+        super(ParallelEmbedding, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # Set some defaults for compatibility.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        # Divide the weight matrix along the embedding dimension.
+        world_size = get_model_parallel_world_size()
+        self.embedding_dim_per_partition = divide(self.embedding_dim,
+                                                  world_size)
+
+        # Allocate weights.
+        self.weight = Parameter(torch.Tensor(self.num_embeddings,
+                                             self.embedding_dim_per_partition))
+        self.weight.model_parallel = True
+        # And initialize.
+        _initialize_affine_weight(
+            self.weight, self.num_embeddings, self.embedding_dim,
+            self.embedding_dim_per_partition, 1, init_method,
+            stride=1, return_master_weight=False)
+
+    def forward(self, input_):
+        input_parallel = copy_to_model_parallel_region(input_)
+        output_parallel = F.embedding(input_parallel, self.weight,
+                                      self.padding_idx, self.max_norm,
+                                      self.norm_type, self.scale_grad_by_freq,
+                                      self.sparse)
+        output = gather_from_model_parallel_region(output_parallel)
+        return output
+
+
+class ColumnParallelLinear(torch.nn.Module):
+    """Linear layer with column parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its second dimension as A = [A_1, ..., A_p].
+
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias
+        gather_output: If true, call all-gather on output and make Y available
+                       to all GPUs, otherwise, every GPU will have its output
+                       which is Y_i = XA_i
+        init_method: method to initialize weights. Note that bias is always set
+                     to zero.
+        stride: For the strided linear layers.
+        keep_master_weight_for_test: This was added for testing and should be
+                                     set to False. It returns the master weights
+                                     used for initialization.
+    """
+    def __init__(self, input_size, output_size, bias=True, gather_output=True,
+                 init_method=init.xavier_normal_, stride=1,
+                 keep_master_weight_for_test=False,
+                 pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0,
+                 LR_weight_rank=8, LR_mask_rank=8):
+        super(ColumnParallelLinear, self).__init__()
+
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.gather_output = gather_output
+        # Divide the weight matrix along the last dimension.
+        world_size = get_model_parallel_world_size()
+        self.output_size_per_partition = divide(output_size, world_size)
+
+        # Parameters.
+        # Note: torch.nn.functional.linear performs XA^T + b and as a result
+        # we allocate the transpose.
+        self.weight = Parameter(torch.Tensor(self.output_size_per_partition,
+                                             self.input_size))
+        self.weight.model_parallel = True
+        if bias:
+            self.bias = Parameter(torch.Tensor(self.output_size_per_partition))
+            self.bias.model_parallel = True
+            # Always initialize bias to zero.
+            with torch.no_grad():
+                self.bias.zero_()
+        else:
+            self.register_parameter('bias', None)
+
+        # Initialize weight.
+        self.master_weight = _initialize_affine_weight(
+            self.weight, self.output_size, self.input_size,
+            self.output_size_per_partition, 0, init_method,
+            stride=stride, return_master_weight=keep_master_weight_for_test)
+        # self.timers = SynchronizedWallClockTimer()
+
+        self.pruning_method = None
+
+    def init_mask(self):
+        if self.pruning_mask_init == "constant":
+            init.constant_(self.mask_scores, val=self.pruning_mask_scale)
+        elif self.pruning_mask_init == "uniform":
+            init.uniform_(self.mask_scores, a=-self.pruning_mask_scale, b=self.pruning_mask_scale)
+        elif self.pruning_mask_init == "kaiming":
+            init.kaiming_uniform_(self.mask_scores, a=math.sqrt(5))
+
+    def load_mask(self, pruning_threshold, k_threshold=None):
+        if self.pruning_method in ["finetune"]:
+            # mask = TopKBinarizer.apply(self.mask_scores.cpu(), pruning_threshold, k_threshold)
+            if k_threshold is not None:
+                self.mask[self.mask_scores <= k_threshold] = 0
+            else:
+                _, idx = self.mask_scores.cpu().flatten().sort(descending=True)
+                j = int(pruning_threshold * self.mask_scores.numel())
+                flat_out = self.mask.flatten()
+                flat_out[idx[j:]] = 0
+                flat_out[idx[:j]] = 1
+
+            self.__setattr__("mask_scores", None)
+            del self.mask_scores
+            self.weight.data = self.weight.data*self.mask.data
+
+            # mask = TopKBinarizer.apply(self.mask_scores, pruning_threshold, k_threshold)
+            # self.mask.data = mask.data
+            # self.weight.data = self.weight.data*self.mask.data
+            # del self.mask_scores
+            # self.__setattr__("mask_scores", None)
+            return True
+        else:
+            return False
+
+    def forward(self, input_, pruning_threshold=None):
+        # Set up backprop all-reduce.
+        input_parallel = copy_to_model_parallel_region(input_)
+
+        # Matrix multiply.
+        if hasattr(self, 'linear'):
+            output_parallel = self.linear(input_parallel, self.weight, self.bias)
+        else:
+            output_parallel = F.linear(input_parallel, self.weight, self.bias)
+
+        if self.gather_output:
+            # All-gather across the partitions.
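+            # (Each rank holds its local Y_i = X A_i; the gather concatenates
+            # the partitions along the last dimension so every rank sees Y.)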
+ output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + return output + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + """ + def __init__(self, input_size, output_size, bias=True, + input_is_parallel=False, + init_method=init.xavier_normal_, stride=1, + keep_master_weight_for_test=False, + pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0, + LR_weight_rank=8, LR_mask_rank=8): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + # Divide the weight matrix along the last dimension. + world_size = get_model_parallel_world_size() + self.input_size_per_partition = divide(input_size, world_size) + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + self.weight = Parameter(torch.Tensor(self.output_size, + self.input_size_per_partition)) + self.weight.model_parallel = True + if bias: + self.bias = Parameter(torch.Tensor(self.output_size)) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + # Initialize weight. + self.master_weight = _initialize_affine_weight( + self.weight, self.output_size, self.input_size, + self.input_size_per_partition, 1, init_method, + stride=stride, return_master_weight=keep_master_weight_for_test) + + self.pruning_method = None + + def init_mask(self): + if self.pruning_mask_init == "constant": + init.constant_(self.mask_scores, val=self.pruning_mask_scale) + elif self.pruning_mask_init == "uniform": + init.uniform_(self.mask_scores, a=-self.pruning_mask_scale, b=self.pruning_mask_scale) + elif self.pruning_mask_init == "kaiming": + init.kaiming_uniform_(self.mask_scores, a=math.sqrt(5)) + + def load_mask(self, pruning_threshold, k_threshold=None): + if self.pruning_method in ["finetune"]: + if k_threshold is not None: + self.mask[self.mask_scores <= k_threshold] = 0 + else: + _, idx = self.mask_scores.cpu().flatten().sort(descending=True) + j = int(pruning_threshold * self.mask_scores.numel()) + flat_out = self.mask.flatten() + flat_out[idx[j:]] = 0 + flat_out[idx[:j]] = 1 + + self.__setattr__("mask_scores", None) + del self.mask_scores + self.weight.data = self.weight.data*self.mask.data + + # mask = TopKBinarizer.apply(self.mask_scores, pruning_threshold, k_threshold) + # self.mask.data = mask.data + # self.weight.data = self.weight.data*self.mask.data + # del self.mask_scores + # self.__setattr__("mask_scores", None) + return True + else: + return False + + def forward(self, input_, pruning_threshold=None): + # Set up backprop all-reduce. 
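+        # (With the weight split along its input dimension, each rank computes
+        # a partial product from its input shard; the all-reduce below sums
+        # these partials into the full output Y.)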
+ if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + + # Matrix multiply. + if hasattr(self, 'linear'): + output_parallel = self.linear(input_parallel, self.weight) + else: + output_parallel = F.linear(input_parallel, self.weight) + + # All-reduce across all the partitions. + output_ = reduce_from_model_parallel_region(output_parallel) + if self.bias is not None: + output = output_ + self.bias + else: + output = output_ + return output + diff --git a/modelscope/utils/nlp/mpu/mappings.py b/modelscope/utils/nlp/mpu/mappings.py new file mode 100755 index 00000000..ab7f4a69 --- /dev/null +++ b/modelscope/utils/nlp/mpu/mappings.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from .initialize import get_model_parallel_group +from .utils import split_tensor_along_last_dim +from deepspeed.utils.timer import SynchronizedWallClockTimer + +def _reduce(input_): + """All-reduce the the input tensor across model parallel group.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # All-reduce. + torch.distributed.all_reduce(input_, group=group) + + return input_ + + +def _split(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Split along last dimension. + world_size = torch.distributed.get_world_size(group=group) + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. + rank = torch.distributed.get_rank(group=group) + output = input_list[rank].contiguous() + + return output + + +def _gather(input_): + """Gather tensors and concatinate along the last dimension.""" + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Size and dimension. + last_dim = input_.dim() - 1 + rank = torch.distributed.get_rank(group=group) + world_size = torch.distributed.get_world_size(group=group) + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather(tensor_list, input_, group=group) + + # Note: torch.cat already creates a contiguous tensor. 
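+    # After all_gather, every rank holds all world_size shards, so the cat()
+    # below reconstructs the full tensor along its last dimension on each rank.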
+    output = torch.cat(tensor_list, dim=last_dim).contiguous()
+
+    return output
+
+
+class _CopyToModelParallelRegion(torch.autograd.Function):
+    """Pass the input to the model parallel region."""
+
+    @staticmethod
+    def forward(ctx, input_):
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        #timers = SynchronizedWallClockTimer()
+        #timers('backward _Copy reduce').start()
+        out = _reduce(grad_output)
+        #timers('backward _Copy reduce').stop()
+        #timers.log(names=['backward _Copy reduce'])
+        return out
+        #return _reduce(grad_output)
+
+
+class _ReduceFromModelParallelRegion(torch.autograd.Function):
+    """All-reduce the input from the model parallel region."""
+
+    @staticmethod
+    def forward(ctx, input_):
+        #timers = SynchronizedWallClockTimer()
+        #timers('forward _Reduce reduce').start()
+        out = _reduce(input_)
+        #timers('forward _Reduce reduce').stop()
+        #timers.log(names=['forward _Reduce reduce'])
+        return out
+        #return _reduce(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output
+
+
+class _ScatterToModelParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the chunk corresponding to the rank."""
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _split(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        #timers = SynchronizedWallClockTimer()
+        #timers('backward _Scatter gather').start()
+        out = _gather(grad_output)
+        #timers('backward _Scatter gather').stop()
+        #timers.log(names=['backward _Scatter gather'])
+        return out
+        #return _gather(grad_output)
+
+
+class _GatherFromModelParallelRegion(torch.autograd.Function):
+    """Gather the input from the model parallel region and concatenate."""
+
+    @staticmethod
+    def forward(ctx, input_):
+        #timers = SynchronizedWallClockTimer()
+        #timers('forward _Gather gather').start()
+        out = _gather(input_)
+        #timers('forward _Gather gather').stop()
+        #timers.log(names=['forward _Gather gather'])
+        return out
+        #return _gather(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _split(grad_output)
+
+
+# -----------------
+# Helper functions.
+# -----------------
+
+def copy_to_model_parallel_region(input_):
+    return _CopyToModelParallelRegion.apply(input_)
+
+def reduce_from_model_parallel_region(input_):
+    return _ReduceFromModelParallelRegion.apply(input_)
+
+def scatter_to_model_parallel_region(input_):
+    return _ScatterToModelParallelRegion.apply(input_)
+
+def gather_from_model_parallel_region(input_):
+    return _GatherFromModelParallelRegion.apply(input_)
diff --git a/modelscope/utils/nlp/mpu/random.py b/modelscope/utils/nlp/mpu/random.py
new file mode 100755
index 00000000..c6f5a946
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/random.py
@@ -0,0 +1,391 @@
+# coding=utf-8
+# Modified by Samyam Rajbhandari
+# Used to partition the activations stored for backward propagation,
+# thereby reducing memory consumption.
+
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+import contextlib
+import torch.distributed as dist
+import torch
+from torch import _C
+from torch.cuda import _lazy_call, device as device_ctx_manager
+#from torch.utils.checkpoint import detach_variable
+from ..utils import print_rank_0
+
+PARTITION_ACTIVATIONS = False
+PA_CORRECTNESS_TEST = False
+
+def see_memory_usage(message, force=False):
+    if not force:
+        return
+    dist.barrier()
+    if dist.get_rank() == 0:
+        print(message)
+        print("Memory Allocated ", torch.cuda.memory_allocated()/(1024*1024*1024), "GigaBytes")
+        print("Max Memory Allocated ", torch.cuda.max_memory_allocated()/(1024*1024*1024), "GigaBytes")
+        print("Cache Allocated ", torch.cuda.memory_cached()/(1024*1024*1024), "GigaBytes")
+        print("Max cache Allocated ", torch.cuda.max_memory_cached()/(1024*1024*1024), "GigaBytes")
+        print(" ")
+        #input("Press Any Key To Continue ..")
+
+
+from .initialize import get_data_parallel_rank
+from .initialize import get_model_parallel_rank
+from .initialize import get_model_parallel_world_size
+from .initialize import get_model_parallel_group
+
+mp_rank = None  # get_model_parallel_rank()
+mp_size = None  # get_model_parallel_world_size()
+mp_group = None  # get_model_parallel_group()
+
+# Default name for the model parallel rng tracker.
+_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
+transport_stream = None
+cuda_device = None
+
+def detach_variable(inputs, device=None):
+    if isinstance(inputs, tuple):
+        out = []
+        for inp in inputs:
+            if not isinstance(inp, torch.Tensor):
+                out.append(inp)
+                continue
+
+            requires_grad = inp.requires_grad
+
+            if device is not None:
+                x = inp.to(device=device)
+            else:
+                x = inp
+
+            x = x.detach()
+            x.requires_grad = requires_grad
+            out.append(x)
+        return tuple(out)
+    else:
+        raise RuntimeError(
+            "Only tuple of tensors is supported. Got unsupported input type: ", type(inputs).__name__)
+
+def _set_cuda_rng_state(new_state, device=-1):
+    """Sets the random number generator state of the current GPU.
+
+    Arguments:
+        new_state (torch.ByteTensor): The desired state
+    This function is adapted from the PyTorch repo (torch.cuda.set_rng_state)
+    with a single change: the input state is not cloned. Cloning caused
+    major performance issues for 4+ GPU cases.
+    """
+    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
+        # older PyTorch
+        def cb():
+            with device_ctx_manager(device):
+                _C._cuda_setRNGState(new_state)
+    else:
+        # newer PyTorch
+        if device == -1:
+            device = torch.device('cuda')
+        elif isinstance(device, str):
+            device = torch.device(device)
+        elif isinstance(device, int):
+            device = torch.device('cuda', device)
+
+        def cb():
+            idx = device.index
+            if idx is None:
+                idx = torch.cuda.current_device()
+            default_generator = torch.cuda.default_generators[idx]
+            default_generator.set_state(new_state)
+
+    _lazy_call(cb)
+
+
+class CudaRNGStatesTracker:
+    """Tracker for the cuda RNG states.
+
+    Using the `add` method, a cuda rng state is initialized based on
+    the input `seed` and is assigned to `name`. Later, by forking the
+    rng state, we can perform operations and return to our starting
+    cuda state.
+    """
+    def __init__(self):
+        # Map from a string name to the cuda rng state.
+        self.states_ = {}
+        # Seeds are just for bookkeeping and ensure no seed is set twice.
+        self.seeds_ = set()
+
+    def reset(self):
+        """Set to the initial state (no tracker)."""
+        self.states_ = {}
+        self.seeds_ = set()
+
+    def get_states(self):
+        """Get rng states. Copy the dictionary so we have direct
+        pointers to the states, not just a pointer to the dictionary."""
+        states = {}
+        for name in self.states_:
+            states[name] = self.states_[name]
+        return states
+
+    def set_states(self, states):
+        """Set the rng states. For efficiency purposes, we do not check
+        the size of seed for compatibility."""
+        self.states_ = states
+
+    def add(self, name, seed):
+        """Track the rng state."""
+        # Check seed is not already used.
+        if seed in self.seeds_:
+            raise Exception('seed {} already exists'.format(seed))
+        self.seeds_.add(seed)
+        # Check that state is not already defined.
+        if name in self.states_:
+            raise Exception('cuda rng state {} already exists'.format(name))
+        # Get the current rng state.
+        orig_rng_state = torch.cuda.get_rng_state()
+        # Set the new state and store it.
+        torch.cuda.manual_seed(seed)
+        self.states_[name] = torch.cuda.get_rng_state()
+        # Reset rng state to what it was.
+        _set_cuda_rng_state(orig_rng_state)
+
+    @contextlib.contextmanager
+    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
+        """Fork the cuda rng state, perform operations, and exit with
+        the original state."""
+        # Check if we have added the state.
+        if name not in self.states_:
+            raise Exception('cuda rng state {} is not added'.format(name))
+        # Store current rng state.
+        orig_cuda_rng_state = torch.cuda.get_rng_state()
+        # Set rng state to the desired one.
+        _set_cuda_rng_state(self.states_[name])
+        # Do the stuff we wanted to do.
+        try:
+            yield
+        finally:
+            # Update the current rng state for later use.
+            self.states_[name] = torch.cuda.get_rng_state()
+            # And set the state to the original state we started with.
+            _set_cuda_rng_state(orig_cuda_rng_state)
+
+
+# RNG tracker object.
+_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
+
+
+def get_cuda_rng_tracker():
+    """Get cuda rng tracker."""
+    return _CUDA_RNG_STATE_TRACKER
+
+
+def model_parallel_cuda_manual_seed(seed):
+    """Initialize model parallel cuda seed.
+
+    This function should be called after the model parallel is
+    initialized. Also, no torch.cuda.manual_seed should be called
+    after this function. Basically, this is a replacement for that
+    function.
+    Two sets of RNG states are tracked:
+        default state: This is for data parallelism and is the same among a
+                       set of model parallel GPUs but different across
+                       different model parallel groups. This is used for
+                       example for dropout in the non-model-parallel regions.
+        model-parallel state: This state is different among a set of model
+                              parallel GPUs, but the same across data parallel
+                              groups. This is used for example for dropout in
+                              model parallel regions.
+    """
+    # 2718 is just for fun and any POSITIVE value will work.
+    offset = seed + 2718
+    model_parallel_seed = offset + get_model_parallel_rank()
+    # Data parallel gets the original seed.
+    data_parallel_seed = seed
+
+    if torch.distributed.get_rank() == 0:
+        print('> initializing model parallel cuda seeds on global rank {}, '
+              'model parallel rank {}, and data parallel rank {} with '
+              'model parallel seed: {} and data parallel seed: {}'.format(
+                  torch.distributed.get_rank(), get_model_parallel_rank(),
+                  get_data_parallel_rank(), model_parallel_seed,
+                  data_parallel_seed), flush=True)
+    _CUDA_RNG_STATE_TRACKER.reset()
+    # Set the default state.
+    torch.cuda.manual_seed(data_parallel_seed)
+    # and model parallel state.
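+    # (The model-parallel state is tracked under
+    # _MODEL_PARALLEL_RNG_TRACKER_NAME so that dropout inside model-parallel
+    # regions draws different numbers on each rank of a group.)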
+ _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, + model_parallel_seed) + + +def get_partition_start(item): + global mp_rank, mp_size, mp_group + partition_size = get_partition_size(item) + start = partition_size * mp_rank + return int(start) + +def get_partition_size(item): + global mp_rank, mp_size, mp_group + size = item.numel() + partition_size = size/mp_size + return int(partition_size) + +def get_full_inputs(tensors): + inputs=[] + for i in range(int(len(tensors)/2)-1): + item = tensors[2 * i] + size = tensors[2* i + 1] + partition_size = item.numel() + tensor_size = partition_size * mp_size + flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=item.device) + partitions=[] + for i in range(mp_size): + part_i = flat_tensor.narrow(0, partition_size * i , partition_size) + if i == mp_rank: + part_i.copy_(item) + partitions.append(part_i) + dist.all_gather(partitions,partitions[mp_rank], group=mp_group) + input_tensor = flat_tensor.view(list(size.numpy())) + item.data=input_tensor.data + + inputs.append(item) + inputs.append(tensors[-2]) + + return tuple(inputs) + + + +class CheckpointFunction(torch.autograd.Function): + """This function is adapted from torch.utils.checkpoint with + two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + """ + @staticmethod + def forward(ctx, run_function, *args): + ctx.run_function = run_function + global mp_rank, mp_size, mp_group + if mp_rank is None: + mp_rank = get_model_parallel_rank() + mp_size = get_model_parallel_world_size() + mp_group = get_model_parallel_group() + + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + if cuda_device is None: + if dist.get_rank() == 0: + print(f"Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}") + + cuda_device = torch.cuda.current_device() + #The transport stream is used to overlap the allgather communication for the activations + #with the computation in the backward pass + transport_stream = torch.cuda.Stream(device=cuda_device) + + if PARTITION_ACTIVATIONS: + #inputs = [item.detach().contiguous().view(-1).narrow(0, get_partition_start(item), get_partition_size(item)).clone() for item in args[:-1]] + #inputs.append(args[-1]) + print_rank_0("args: ", args) + inputs = [item.detach().contiguous().view(-1).narrow(0, get_partition_start(item), get_partition_size(item)).clone() for item in args[:2]] + inputs.extend(args[2:]) + + + #just in case something funky is happening such as reuse of inputs + inputs_cuda = [item.to(cuda_device) for item in args] + + # Copy the rng states. 
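+        # (Saving the CPU and CUDA RNG states here lets backward() replay the
+        # forward pass with identical dropout masks before recomputing
+        # activations; they are restored around the re-run in backward().)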
+ ctx.fwd_cpu_rng_state = torch.get_rng_state() + ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + #ctx.save_for_backward(*args) + with torch.no_grad(): + outputs = run_function(*inputs_cuda) + + del inputs_cuda + + if PARTITION_ACTIVATIONS: + new_args = [] + for arg, inp in zip(args,inputs): + size= torch.tensor(arg.size()) + arg.data = inp.data + new_args.append(arg) + new_args.append(size) + ctx.save_for_backward(*new_args) + else: + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError("Checkpointing is not compatible with .grad(), " + "please use .backward() if possible") + + global cuda_device, transport_stream, PARTITION_ACTIVATIONS + + if PARTITION_ACTIVATIONS: + with torch.cuda.stream(transport_stream): + inputs = get_full_inputs(ctx.saved_tensors) + detached_inputs = detach_variable(inputs) + else: + inputs = ctx.saved_tensors + detached_inputs = detach_variable(inputs) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. + torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + if PARTITION_ACTIVATIONS: + current_stream=torch.cuda.current_stream() + current_stream.wait_stream(transport_stream) + + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. + torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, torch.Tensor): + outputs = (outputs,) + torch.autograd.backward(outputs, args) + return (None,) + tuple(inp.grad for inp in detached_inputs) + + +def checkpoint(function, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, *args) + +def partition_activations_in_checkpoint(partition_activation): + global PARTITION_ACTIVATIONS + PARTITION_ACTIVATIONS=partition_activation + if dist.get_rank() == 0: + print(f"**************Partition Activations {PARTITION_ACTIVATIONS}************") + + diff --git a/modelscope/utils/nlp/mpu/tests/__init__.py b/modelscope/utils/nlp/mpu/tests/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/modelscope/utils/nlp/mpu/tests/commons.py b/modelscope/utils/nlp/mpu/tests/commons.py new file mode 100755 index 00000000..be986e58 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/commons.py @@ -0,0 +1,82 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
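+
+# Shared helpers for the mpu tests: IdentityLayer wraps a learnable weight so
+# gradients can be checked end-to-end, and initialize_distributed() bootstraps
+# torch.distributed from the RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT
+# environment variables.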
+ +import argparse +import os +import random +import numpy +import torch + +import mpu + + +class IdentityLayer(torch.nn.Module): + def __init__(self, size, scale=1.0): + super(IdentityLayer, self).__init__() + self.weight = torch.nn.Parameter(scale * torch.randn(size)) + def forward(self): + return self.weight + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + mpu.model_parallel_cuda_manual_seed(seed) + + +def initialize_distributed(backend='nccl'): + """Initialize torch.distributed.""" + # Get local rank in case it is provided. + parser = argparse.ArgumentParser() + parser.add_argument('--local_rank', type=int, default=None, + help='local rank passed from distributed launcher') + args = parser.parse_args() + local_rank = args.local_rank + + # Get rank and world size. + rank = int(os.getenv('RANK', '0')) + world_size = int(os.getenv("WORLD_SIZE", '1')) + + print('> initializing torch.distributed with local rank: {}, ' + 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) + + # Set the device id. + device = rank % torch.cuda.device_count() + if local_rank is not None: + device = local_rank + #torch.cuda.set_device(device) + + # Call the init process. + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + torch.distributed.init_process_group( + backend=backend, + world_size=world_size, + rank=rank, + init_method=init_method) + + +def print_separator(message): + torch.distributed.barrier() + filler_len = (78 - len(message)) // 2 + filler = '-' * filler_len + string = '\n' + filler + ' {} '.format(message) + filler + if torch.distributed.get_rank() == 0: + print(string, flush=True) + torch.distributed.barrier() diff --git a/modelscope/utils/nlp/mpu/tests/test_cross_entropy.py b/modelscope/utils/nlp/mpu/tests/test_cross_entropy.py new file mode 100755 index 00000000..20875971 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/test_cross_entropy.py @@ -0,0 +1,110 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
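+
+# This test builds identical logits on every rank, computes the reference loss
+# with torch.nn.functional.cross_entropy and the parallel loss with
+# vocab_parallel_cross_entropy on scattered logits, and asserts that losses
+# and input gradients agree to within 1e-6.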
+ +import random +import sys +sys.path.append("../..") + +import torch +import torch.nn.functional as F +import mpu +from mpu.cross_entropy import vocab_parallel_cross_entropy + +from commons import initialize_distributed +from commons import print_separator +from commons import IdentityLayer +from commons import set_random_seed + + +def torch_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + target = torch.cuda.LongTensor( + size=(batch_size, seq_length)).random_(0, vocab_size) + loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), + target.view(-1), + reduction='none').view_as(target).mean() + loss.backward() + return loss, identity.weight.grad + + +def mpu_cross_entropy(batch_size, seq_length, vocab_size, + logits_scale, seed): + set_random_seed(seed) + identity = IdentityLayer((batch_size, seq_length, vocab_size), + scale=logits_scale).cuda() + logits = identity() + logits_parallel = mpu.scatter_to_model_parallel_region(logits) + target = torch.cuda.LongTensor( + size=(batch_size, seq_length)).random_(0, vocab_size) + loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() + loss.backward() + return loss, identity.weight.grad + + +def test_cross_entropy(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cross entropy with model parallel size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 13 + seq_length = 17 + vocab_size_per_partition = 11 + logits_scale = 1000.0 + vocab_size = vocab_size_per_partition * model_parallel_size + seed = 1234 + + loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, + vocab_size, logits_scale, + seed) + + error = loss_torch.sub_(loss_mpu).abs().max() + print(' max error in loss on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = grad_torch.sub_(grad_mpu).abs().max() + print(' max error in grad on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test cross entropy') + test_cross_entropy(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/utils/nlp/mpu/tests/test_data.py b/modelscope/utils/nlp/mpu/tests/test_data.py new file mode 100755 index 00000000..6e8eca73 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/test_data.py @@ -0,0 +1,92 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import operator
+import sys
+sys.path.append("../..")
+
+import torch
+import mpu
+from mpu import data as data_utils
+
+from commons import initialize_distributed
+from commons import print_separator
+
+
+def test_broadcast_data(model_parallel_size):
+
+    if torch.distributed.get_rank() == 0:
+        print('> testing broadcast_data with model parallel size {} ...'.
+              format(model_parallel_size))
+
+    mpu.initialize_model_parallel(model_parallel_size)
+    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
+    model_parallel_size = mpu.get_model_parallel_world_size()
+
+    key_size_t = {'key1': [7, 11],
+                  'key2': [8, 2, 1],
+                  'key3': [13],
+                  'key4': [5, 1, 2],
+                  'key5': [5, 12]}
+    keys = list(key_size_t.keys())
+
+    data = {}
+    data_t = {}
+    for key in key_size_t:
+        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
+        data_t[key] = data[key].clone()
+    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
+    data_t['keyX'] = data['keyX'].clone()
+    if mpu.get_model_parallel_rank() != 0:
+        data = None
+
+    data_utils._check_data_types(keys, data_t, torch.int64)
+    key_size, key_numel, \
+        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
+    for key in keys:
+        assert key_size[key] == key_size_t[key]
+    total_numel_t = 0
+    for key in keys:
+        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
+        assert key_numel[key] == target_size
+        total_numel_t += target_size
+    assert total_numel == total_numel_t
+
+    data_b = data_utils.broadcast_data(keys, data, torch.int64)
+    for key in keys:
+        tensor = data_t[key].cuda()
+        assert data_b[key].sub(tensor).abs().max() == 0
+
+    # Reset groups
+    mpu.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print('>> passed the test :-)')
+
+
+if __name__ == '__main__':
+
+    initialize_distributed()
+    world_size = torch.distributed.get_world_size()
+
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test broadcast data')
+        test_broadcast_data(model_parallel_size)
+        model_parallel_size *= 2
+
+
diff --git a/modelscope/utils/nlp/mpu/tests/test_initialize.py b/modelscope/utils/nlp/mpu/tests/test_initialize.py
new file mode 100755
index 00000000..c77e2e6a
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/tests/test_initialize.py
@@ -0,0 +1,98 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
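+
+# Verifies group construction: each rank's model-parallel world size and rank,
+# the induced data-parallel groups, and get_model_parallel_src_rank(), which
+# is used as the broadcast source within a model-parallel group.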
+ +import sys +sys.path.append("../..") + +import torch +import mpu + +from commons import initialize_distributed +from commons import print_separator + + +def test_initialize_model_parallel(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing initialize_model_parallel with size {} ...'.format( + model_parallel_size)) + model_parallel_size_ = min(model_parallel_size, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size_) + assert mpu.model_parallel_is_initialized() + + # Checks. + def check(group, world_size, rank): + assert world_size == torch.distributed.get_world_size(group=group) + assert rank == torch.distributed.get_rank(group=group) + + # Model parallel. + world_size = model_parallel_size_ + rank = torch.distributed.get_rank() % model_parallel_size_ + assert world_size == mpu.get_model_parallel_world_size() + assert rank == mpu.get_model_parallel_rank() + check(mpu.get_model_parallel_group(), world_size, rank) + + + # Data parallel. + world_size = torch.distributed.get_world_size() // model_parallel_size_ + rank = torch.distributed.get_rank() // model_parallel_size + assert world_size == mpu.get_data_parallel_world_size() + assert rank == mpu.get_data_parallel_rank() + check(mpu.get_data_parallel_group(), world_size, rank) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_get_model_parallel_src_rank(model_parallel_size_): + + if torch.distributed.get_rank() == 0: + print('> testing get_model_parallel_src_rank with size {} ...'.format( + model_parallel_size_)) + model_parallel_size = min(model_parallel_size_, + torch.distributed.get_world_size()) + assert not mpu.model_parallel_is_initialized() + mpu.initialize_model_parallel(model_parallel_size) + assert mpu.model_parallel_is_initialized() + + # Checks + src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() + assert mpu.get_model_parallel_src_rank() == src_rank + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +if __name__ == '__main__': + + initialize_distributed() + world_size = torch.distributed.get_world_size() + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test initialize model parallel') + test_initialize_model_parallel(model_parallel_size) + print_separator('test model parallel source rank') + test_get_model_parallel_src_rank(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/utils/nlp/mpu/tests/test_layers.py b/modelscope/utils/nlp/mpu/tests/test_layers.py new file mode 100755 index 00000000..c38bf725 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/test_layers.py @@ -0,0 +1,529 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
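+
+# Exercises the parallel layers end-to-end against their serial torch
+# counterparts: embeddings, column-/row-parallel linear layers, self-attention
+# and a full transformer layer, comparing losses and gradients across ranks.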
+ +import random +import sys +sys.path.append("../..") + +import torch +import torch.nn.init as init +from torch.nn.parameter import Parameter +import mpu + +from commons import initialize_distributed +from commons import print_separator +from commons import set_random_seed +from mpu import layers + + +def test_parallel_embedding(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing parallel embedding with model parallel size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + batch_size = 17 + seq_length = 23 + vocab_size = 48 + hidden_size = 16 + seed = 1236 + + set_random_seed(123) + input_data = torch.LongTensor( + size=(batch_size,seq_length)).random_(0, vocab_size).cuda() + loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() + + set_random_seed(seed) + embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() + + output = embedding_original(input_data) + loss_original = torch.mul(output, loss_weight).sum() + loss_original.backward() + + set_random_seed(seed) + embedding_parallel = layers.ParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_parallel(input_data) + loss_parallel = torch.mul(output, loss_weight).sum() + loss_parallel.backward() + + set_random_seed(seed) + embedding_vocab_parallel = layers.VocabParallelEmbedding( + vocab_size, hidden_size, init_method=init.normal_).cuda() + output = embedding_vocab_parallel(input_data) + loss_vocab_parallel = torch.mul(output, loss_weight).sum() + loss_vocab_parallel.backward() + + torch.distributed.barrier() + error = loss_parallel.sub(loss_original).abs() + print(' error in loss (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + torch.distributed.barrier() + error = loss_vocab_parallel.sub(loss_original).abs() + print(' error in loss (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + hidden_size // model_parallel_size, + 1)[mpu.get_model_parallel_rank()] + error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max() + print(' error in grad (parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + weight_grad_orig = torch.split(embedding_original.weight.grad, + vocab_size // model_parallel_size, + 0)[mpu.get_model_parallel_rank()] + error = embedding_vocab_parallel.weight.grad.sub( + weight_grad_orig).abs().max() + print(' error in grad (vocab parallel) on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-12, 'error: {}'.format(error) + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_initialize_affine_weight(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing initialize_affine_weight with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = 
output_size_coeff * model_parallel_size + + # --------------- + # Column parallel + # --------------- + weight = torch.empty(output_size_coeff, input_size) + set_random_seed(seed) + layers._initialize_affine_weight(weight, output_size, input_size, + + output_size_coeff, 0, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split(master_weight, output_size_coeff, + dim=0)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' column parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # ------------ + # Row parallel + # ------------ + weight = torch.empty(output_size, input_size_coeff) + set_random_seed(seed) + mpu.layers._initialize_affine_weight(weight, output_size, input_size, + input_size_coeff, 1, + torch.nn.init.normal_) + # Target. + set_random_seed(seed) + master_weight = torch.empty(output_size, input_size) + torch.nn.init.normal_(master_weight) + rank = mpu.get_model_parallel_rank() + my_weight = torch.split(master_weight, input_size_coeff, + dim=1)[rank].contiguous().clone() + + # Compare. + error = weight.sub(my_weight).abs().max() + torch.distributed.barrier() + print(' row parallel max error (should be zero) on global rank ' + '{}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +class IdentityLayer2D(torch.nn.Module): + def __init__(self, m , n): + super(IdentityLayer2D, self).__init__() + self.weight = Parameter(torch.Tensor(m, n)) + torch.nn.init.xavier_normal_(self.weight) + def forward(self): + return self.weight + + +def test_column_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing ColumnParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.ColumnParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
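+    # With output Y = X A^T + b and scalar loss L = sum(Y * W), the analytic
+    # gradients are dL/dA = W^T X, dL/db = the column sums of W, and
+    # dL/dX = W A; each rank should match the dim=0 shard of dL/dA that
+    # corresponds to its local weight partition.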
+ dLdY = loss_weight + X = identity_layer.weight + A = linear_layer.master_weight.cuda() + dLdA = torch.matmul(dLdY.t(), X) + dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdX = torch.matmul(dLdY, A) + + rank = mpu.get_model_parallel_rank() + my_dLdA = torch.split(dLdA, output_size_coeff, + dim=0)[rank].contiguous().clone() + error = my_dLdA.sub(linear_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdA on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + my_dLdb = torch.split(dLdb, output_size_coeff, + dim=0)[rank].contiguous().clone() + error = my_dLdb.sub(linear_layer.bias.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdb on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + error = dLdX.sub(identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' error in dLdX on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +def test_row_parallel_linear(model_parallel_size): + + mpu.initialize_model_parallel(model_parallel_size) + if torch.distributed.get_rank() == 0: + print('> testing RowParallelLinear with model parallel ' + 'size: {}'.format(model_parallel_size)) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + input_size_coeff = 13 + input_size = input_size_coeff * model_parallel_size + output_size_coeff = 17 + output_size = output_size_coeff * model_parallel_size + batch_size = 7 + + # Network + identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + linear_layer = mpu.RowParallelLinear( + input_size, output_size, keep_master_weight_for_test=True).cuda() + loss_weight = torch.randn([batch_size, output_size]).cuda() + # Forward + input_ = identity_layer() + output = linear_layer(input_) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + # Values. 
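+    # Same analytic gradients as in the column-parallel test, but the weight
+    # is now split along its input dimension, so the local dL/dA shard comes
+    # from a dim=1 split and dL/db is replicated (the bias is not partitioned).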
+    dLdY = loss_weight
+    X = identity_layer.weight
+    A = linear_layer.master_weight.cuda()
+    dLdA = torch.matmul(dLdY.t(), X)
+    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
+    dLdX = torch.matmul(dLdY, A)
+
+    rank = mpu.get_model_parallel_rank()
+    my_dLdA = torch.split(dLdA, input_size_coeff,
+                          dim=1)[rank].contiguous().clone()
+    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
+    torch.distributed.barrier()
+    print(' error in dLdA on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+
+    error = dLdb.sub(linear_layer.bias.grad).abs().max()
+    torch.distributed.barrier()
+    print(' error in dLdb on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+
+    error = dLdX.sub(identity_layer.weight.grad).abs().max()
+    torch.distributed.barrier()
+    print(' error in dLdX on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+
+    # Reset groups
+    mpu.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print(' >> passed the test :-)')
+
+
+class IdentityLayer3D(torch.nn.Module):
+    def __init__(self, m, n, k):
+        super(IdentityLayer3D, self).__init__()
+        self.weight = Parameter(torch.Tensor(m, n, k))
+        torch.nn.init.xavier_normal_(self.weight)
+    def forward(self):
+        return self.weight
+
+
+def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
+                            hidden_size_per_att_head, dropout_prob, batch_size,
+                            sequence_length):
+    mpu.initialize_model_parallel(model_parallel_size)
+    model_parallel_size = mpu.get_model_parallel_world_size()
+
+    seed = 12345
+    set_random_seed(seed)
+
+    num_att_heads = num_att_heads_per_partition * \
+        torch.distributed.get_world_size()
+    hidden_size = hidden_size_per_att_head * num_att_heads
+
+    # Network
+    identity_layer = IdentityLayer3D(batch_size, sequence_length,
+                                     hidden_size).cuda()
+    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
+                                                    dropout_prob).cuda()
+    loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
+    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
+    # Forward
+    input_ = identity_layer()
+    output = attention_layer(input_, attention_mask)
+    loss = torch.mul(output, loss_weight).sum()
+    # Backward
+    loss.backward()
+
+    rank = mpu.get_model_parallel_rank()
+    mpu.destroy_model_parallel()
+    return rank, hidden_size, model_parallel_size, loss, \
+        attention_layer, identity_layer
+
+
+def test_parallel_self_attention(model_parallel_size):
+
+    if torch.distributed.get_rank() == 0:
+        print('> testing ParallelSelfAttention with model parallel '
+              'size: {}'.format(model_parallel_size))
+
+    num_att_heads_per_partition = 3
+    hidden_size_per_att_head = 7
+    dropout_prob = 0.0  # has to be zero
+    batch_size = 5
+    sequence_length = 13
+
+    rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
+        attention_layer_1, identity_layer_1 = parallel_self_attention(
+            1, num_att_heads_per_partition,
+            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
+
+    rank, hidden_size, model_parallel_size, loss, \
+        attention_layer, identity_layer = parallel_self_attention(
+            model_parallel_size, num_att_heads_per_partition,
+            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
+    assert hidden_size_1 == hidden_size
+
+    error = loss_1.sub(loss).abs().max()
+    torch.distributed.barrier()
+    print(' loss error on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
assert error < 5.0e-6 + + my_lin_grad_list = torch.split( + attention_layer_1.query_key_value.weight.grad, + hidden_size // model_parallel_size, 0)[rank::model_parallel_size] + my_lin_grad = torch.cat(my_lin_grad_list, dim=0) + error = my_lin_grad.sub( + attention_layer.query_key_value.weight.grad).abs().max() + torch.distributed.barrier() + print(' weight gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-6 + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + +def parallel_transformer(model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length): + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed = 12345 + set_random_seed(seed) + + num_att_heads = num_att_heads_per_partition * \ + torch.distributed.get_world_size() + hidden_size = hidden_size_per_att_head * num_att_heads + intermediate_size = 4 * hidden_size + + # Network + identity_layer = IdentityLayer3D(batch_size, sequence_length, + hidden_size).cuda() + transformer_layer = mpu.BertParallelTransformerLayer( + hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, + torch.nn.functional.relu, 1.0e-5).cuda() + + loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda() + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + # Forward + input_ = identity_layer() + output = transformer_layer(input_, attention_mask) + loss = torch.mul(output, loss_weight).sum() + # Backward + loss.backward() + + rank = mpu.get_model_parallel_rank() + mpu.destroy_model_parallel() + return rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer + + +def test_parallel_transformer_layer(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing ParallelTransformerLayer with model parallel ' + 'size: {}'.format(model_parallel_size)) + + num_att_heads_per_partition = 3 + hidden_size_per_att_head = 7 + batch_size = 5 + sequence_length = 13 + + rank_1, hidden_size_1, model_parallel_size_1, loss_1, \ + transformer_layer_1, identity_layer_1 = parallel_transformer( + 1, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + rank, hidden_size, model_parallel_size, loss, \ + transformer_layer, identity_layer = parallel_transformer( + model_parallel_size, num_att_heads_per_partition, + hidden_size_per_att_head, batch_size, sequence_length) + + error = loss_1.sub(loss).abs().max() + torch.distributed.barrier() + print(' loss error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + error = identity_layer_1.weight.grad.sub( + identity_layer.weight.grad).abs().max() + torch.distributed.barrier() + print(' input gradient error on global rank {}: {}'.format( + torch.distributed.get_rank(), error)) + assert error < 5.0e-5, 'error: {}'.format(error) + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(' >> passed the test :-)') + + +if __name__ == '__main__': + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + initialize_distributed() + world_size 
= torch.distributed.get_world_size() + + print_separator('test initialize affine weight') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_initialize_affine_weight(model_parallel_size) + model_parallel_size *= 2 + + model_parallel_size = 1 + while model_parallel_size <= world_size: + print_separator('test parallel embedding') + test_parallel_embedding(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test column-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_column_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test row-parallel linear') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_row_parallel_linear(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel self-attention') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_self_attention(model_parallel_size) + model_parallel_size *= 2 + + print_separator('test parallel transformer') + model_parallel_size = 1 + while model_parallel_size <= world_size: + test_parallel_transformer_layer(model_parallel_size) + model_parallel_size *= 2 diff --git a/modelscope/utils/nlp/mpu/tests/test_random.py b/modelscope/utils/nlp/mpu/tests/test_random.py new file mode 100755 index 00000000..e3792089 --- /dev/null +++ b/modelscope/utils/nlp/mpu/tests/test_random.py @@ -0,0 +1,207 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append("../..") + +import torch +import mpu + +from commons import initialize_distributed +from commons import print_separator + + +def test_set_cuda_rng_state(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing set_rng_state with size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + size = 123 + seed = 1234 + torch.cuda.manual_seed(1234) + tensor = torch.cuda.FloatTensor(size) + + # Get the state + rng_state = torch.cuda.get_rng_state() + rng_state_copy = rng_state.clone() + + # Do some stuff. + for _ in range(5): + torch.randn(size, out=tensor) + result_1 = tensor.clone() + + assert rng_state.sub(rng_state_copy).max() == 0 + assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 + + # State should be different. + new_rng_state = torch.cuda.get_rng_state() + max_diff = new_rng_state.sub(rng_state).max() + print(' max diff in rng state (should be non-zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), max_diff)) + assert max_diff > 0 + + # Reset the rng state and do the same stuff. 
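+    # (Restoring the saved state should make the generator replay exactly the
+    # same sequence of draws, so result_2 below must match result_1.)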
+ mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + mpu.random._set_cuda_rng_state(rng_state) + for _ in range(5): + torch.randn(size, out=tensor) + result_2 = tensor.clone() + + # Results should be the same + error = result_2.sub(result_1).abs().max() + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Input state should have remained intact. + error = rng_state.sub(rng_state_copy).max() + print(' max error in rng state (should be zero) on global rank {}: {}'. + format(torch.distributed.get_rank(), error)) + assert error == 0 + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_cuda_rng_tracker(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing cuda rng tracker with size {} ...'. + format(model_parallel_size)) + + mpu.initialize_model_parallel(model_parallel_size) + model_parallel_size = mpu.get_model_parallel_world_size() + + seed_1 = 1234 + seed_2 = 4321 + size = [12, 21] + tensor = torch.cuda.FloatTensor(size) + + # Set to seed_1 and generate two tensors. + torch.cuda.manual_seed(seed_1) + torch.randn(size, out=tensor) + target_11 = tensor.clone() + torch.randn(size, out=tensor) + target_12 = tensor.clone() + + # Set to seed_2 and generate two tensors. + torch.cuda.manual_seed(seed_2) + torch.randn(size, out=tensor) + target_21 = tensor.clone() + torch.randn(size, out=tensor) + target_22 = tensor.clone() + + # Now if we interleave seed_1 and seed_2, + # we should still get the same tensors + torch.cuda.manual_seed(seed_1) + mpu.get_cuda_rng_tracker().add('test', seed_2) + + torch.randn(size, out=tensor) + result_11 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_21 = tensor.clone() + + torch.randn(size, out=tensor) + result_12 = tensor.clone() + + with mpu.get_cuda_rng_tracker().fork('test'): + torch.randn(size, out=tensor) + result_22 = tensor.clone() + + diff = result_11.sub(result_21).abs().max() + diff = min(diff, result_12.sub(result_22).abs().max()) + print(' max diff in generated tensors (should be non-zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) + assert diff > 1.0e-6 + error = max(result_11.sub(target_11).abs().max(), + result_12.sub(target_12).abs().max()) + error = max(error, result_21.sub(target_21).abs().max()) + error = max(error, result_22.sub(target_22).abs().max()) + print(' max error in generated tensors (should be zero) on ' + 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) + assert error < 1.0e-6 + + # Reset the tracker + mpu.get_cuda_rng_tracker().reset() + + # Reset groups + mpu.destroy_model_parallel() + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print('>> passed the test :-)') + + +def test_model_parallel_cuda_manual_seed(model_parallel_size): + + if torch.distributed.get_rank() == 0: + print('> testing model parallel cuda manual seed with size {} ...'. 
+              format(model_parallel_size))
+
+    mpu.initialize_model_parallel(model_parallel_size)
+    model_parallel_size = mpu.get_model_parallel_world_size()
+
+    mpu.model_parallel_cuda_manual_seed(12345)
+    assert torch.cuda.initial_seed() == 12345
+    with mpu.get_cuda_rng_tracker().fork():
+        assert torch.cuda.initial_seed() == (12345 + 2718
+                                             + mpu.get_model_parallel_rank())
+
+    # Reset the tracker
+    mpu.get_cuda_rng_tracker().reset()
+
+    # Reset groups
+    mpu.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print('>> passed the test :-)')
+
+
+if __name__ == '__main__':
+
+    initialize_distributed()
+    world_size = torch.distributed.get_world_size()
+
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test set rng state')
+        test_set_cuda_rng_state(model_parallel_size)
+        model_parallel_size *= 2
+
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test cuda rng tracker')
+        test_cuda_rng_tracker(model_parallel_size)
+        model_parallel_size *= 2
+
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test model parallel cuda manual seed')
+        test_model_parallel_cuda_manual_seed(model_parallel_size)
+        model_parallel_size *= 2
diff --git a/modelscope/utils/nlp/mpu/transformer.py b/modelscope/utils/nlp/mpu/transformer.py
new file mode 100755
index 00000000..aa16ff06
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/transformer.py
@@ -0,0 +1,1065 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Transformer."""
+
+import math
+
+import torch
+import torch.nn.init as init
+from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
+
+from .initialize import get_model_parallel_world_size
+from .layers import ColumnParallelLinear
+from .layers import RowParallelLinear
+from .mappings import gather_from_model_parallel_region
+
+import deepspeed
+
+from .random import checkpoint
+from .random import get_cuda_rng_tracker
+
+from .utils import divide
+from .utils import split_tensor_along_last_dim
+from ..utils import print_rank_0
+from deepspeed.utils.timer import SynchronizedWallClockTimer
+
+
+class GPT2ParallelSelfAttention(torch.nn.Module):
+    """Parallel self-attention layer for GPT2.
+
+    The self-attention layer takes input with size [b, s, h] where b is
+    the batch size, s is the sequence length, and h is the hidden size,
+    and creates output of the same size.
+    Arguments:
+        hidden_size: total hidden size of the layer (h).
+        num_attention_heads: number of attention heads (n). Note that we
+                             require n to be divisible by the number of GPUs
+                             used to parallelize the model. Also, we
+                             require hidden size to be divisible by n.
+        dropout_prob: dropout probability for the attention scores.
+        init_method: weight initialization.
+        output_layer_init_method: output layer initialization. If None, use
+                                  `init_method`.
+ We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + def __init__(self, hidden_size, num_attention_heads, + attention_dropout_prob, output_dropout_prob, + init_method, output_layer_init_method=None, + pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0, + LR_weight_rank=8, LR_mask_rank=8): + super(GPT2ParallelSelfAttention, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide(num_attention_heads, + world_size) + # Strided linear layer. + self.query_key_value = ColumnParallelLinear(hidden_size, 3*hidden_size, + stride=3, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear(hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + pruning_method=pruning_method, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, ltor_mask, is_infer=False, pruning_threshold=None,): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. [b, s, hp] + tgt_len = hidden_states.size(1) + mixed_x_layer = self.query_key_value(hidden_states, pruning_threshold=pruning_threshold,) + (mixed_query_layer, + mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + + previous_type = value_layer.type() + + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + # Apply the left to right attention mask. 
+        if is_infer:
+            src_len = key_layer.size(2)
+            ltor_mask = torch.tril(torch.ones(
+                (1, tgt_len, src_len),
+                device=hidden_states.device)).view(1, 1, tgt_len,
+                                                   src_len).type(previous_type)
+        attention_scores = torch.mul(attention_scores, ltor_mask) - \
+            10000.0 * (1.0 - ltor_mask)
+
+        # Attention probabilities. [b, np, s, s]
+        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        with get_cuda_rng_tracker().fork():
+            attention_probs = self.attention_dropout(attention_probs)
+
+        # Context layer.
+        # [b, np, s, hn]
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # [b, s, np, hn]
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        # [b, s, hp]
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        # Output. [b, s, h]
+        output = self.dense(context_layer, pruning_threshold=pruning_threshold)
+        output = self.output_dropout(output)
+
+        return output
+
+
+@torch.jit.script
+def gelu_impl(x):
+    """OpenAI's gelu implementation."""
+    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
+                                       (1.0 + 0.044715 * x * x)))
+
+
+def gelu(x):
+    return gelu_impl(x)
+
+
+class GPT2ParallelMLP(torch.nn.Module):
+    """MLP for GPT2.
+
+    The MLP takes the input with h hidden state, projects it to 4*h
+    hidden dimension, performs a gelu transformation, and projects the
+    state back into h hidden dimension. At the end, dropout is also
+    applied.
+
+    Arguments:
+        hidden_size: The hidden size of the self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        init_method: initialization method used for the weights. Note
+                     that all biases are initialized to zero and
+                     layernorm weights are initialized to one.
+        output_layer_init_method: output layer initialization. If None,
+                                  use `init_method`.
+    """
+
+    def __init__(self, hidden_size, output_dropout_prob, init_method,
+                 output_layer_init_method=None,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(GPT2ParallelMLP, self).__init__()
+        # Set output layer initialization if not provided.
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+        # Project to 4h.
+        self.dense_h_to_4h = ColumnParallelLinear(hidden_size, 4 * hidden_size,
+                                                  gather_output=False,
+                                                  init_method=init_method,
+                                                  pruning_method=pruning_method,
+                                                  pruning_mask_init=pruning_mask_init,
+                                                  pruning_mask_scale=pruning_mask_scale)
+        # Project back to h.
+        self.dense_4h_to_h = RowParallelLinear(
+            4 * hidden_size,
+            hidden_size,
+            input_is_parallel=True,
+            init_method=output_layer_init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+        self.dropout = torch.nn.Dropout(output_dropout_prob)
+
+    def forward(self, hidden_states, pruning_threshold=None):
+        # [b, s, 4hp]
+        intermediate_parallel = self.dense_h_to_4h(
+            hidden_states, pruning_threshold=pruning_threshold)
+        intermediate_parallel = gelu(intermediate_parallel)
+
+        # [b, s, h]
+        output = self.dense_4h_to_h(intermediate_parallel,
+                                    pruning_threshold=pruning_threshold)
+        output = self.dropout(output)
+        return output
+
+
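A quick, self-contained sanity check (not part of the patch) that the tanh formula in gelu_impl above matches PyTorch's built-in tanh GELU; the `approximate='tanh'` argument assumes PyTorch >= 1.12, and 0.7978845608028654 is sqrt(2/pi) from the Hendrycks & Gimpel approximation.

import torch

x = torch.randn(8, 16)
# Same formula as gelu_impl above, written out inline.
ours = 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
ref = torch.nn.functional.gelu(x, approximate='tanh')
assert torch.allclose(ours, ref, atol=1e-6)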
+class GPT2ParallelTransformerLayer(torch.nn.Module):
+    """A single layer transformer for GPT2.
+
+    We use the following notation:
+        h: hidden size
+        n: number of attention heads
+        b: batch size
+        s: sequence length
+    The transformer layer takes input with size [b, s, h] and returns an
+    output of the same size.
+
+    Arguments:
+        hidden_size: The hidden size of the self attention.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                scores in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method: initialization method used for the weights. Note
+                     that all biases are initialized to zero and
+                     layernorm weights are initialized to one.
+        output_layer_init_method: output layers (attention output and
+                                  mlp output) initialization. If None,
+                                  use `init_method`.
+    """
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 layernorm_epsilon,
+                 init_method,
+                 output_layer_init_method=None,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(GPT2ParallelTransformerLayer, self).__init__()
+        # Set output layer initialization if not provided.
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+
+        # Layernorm on the input data.
+        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+
+        # Self attention.
+        self.attention = GPT2ParallelSelfAttention(
+            hidden_size,
+            num_attention_heads,
+            attention_dropout_prob,
+            output_dropout_prob,
+            init_method,
+            output_layer_init_method=output_layer_init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+
+        # Layernorm on the attention output.
+        self.post_attention_layernorm = LayerNorm(hidden_size,
+                                                  eps=layernorm_epsilon)
+
+        # MLP
+        self.mlp = GPT2ParallelMLP(
+            hidden_size,
+            output_dropout_prob,
+            init_method,
+            output_layer_init_method=output_layer_init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+
+    def forward(self, hidden_states, ltor_mask, pruning_threshold=None):
+        # hidden_states: [b, s, h]
+        # ltor_mask: [1, 1, s, s]
+
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output = self.attention(layernorm_output, ltor_mask,
+                                          pruning_threshold=pruning_threshold)
+        # Residual connection.
+        layernorm_input = hidden_states + attention_output
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+        # MLP.
+        mlp_output = self.mlp(layernorm_output,
+                              pruning_threshold=pruning_threshold)
+        # Second residual connection.
+        output = layernorm_input + mlp_output
+
+        return output
+
+
+def unscaled_init_method(sigma):
+    """Init method based on N(0, sigma)."""
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
+
+    return init_
+
+
+def scaled_init_method(sigma, num_layers):
+    """Init method based on N(0, sigma/sqrt(2*num_layers))."""
+    std = sigma / math.sqrt(2.0 * num_layers)
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+
+    return init_
+
+
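A small numeric check (not part of the patch) of why scaled_init_method divides the std by sqrt(2*num_layers): the residual stream accumulates 2N branch outputs per forward (attention output and MLP output for each of N layers), and a sum of 2N independent contributions with std sigma/sqrt(2N) has std of roughly sigma, so the activation scale stays stable with depth.

import math
import torch

sigma, num_layers = 0.02, 24
# 2N independent "residual branch" samples, each scaled like scaled_init_method.
branches = torch.randn(2 * num_layers, 100000) * (sigma / math.sqrt(2.0 * num_layers))
print(branches.sum(dim=0).std().item())  # ~= sigma = 0.02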
+class GPT2ParallelTransformer(torch.nn.Module):
+    """GPT-2 transformer.
+
+    This module takes input from the embedding layer and its output can
+    be used directly by a logit layer. It consists of L (num-layers)
+    blocks of:
+        layer norm
+        self attention
+        residual connection
+        layer norm
+        mlp
+        residual connection
+    followed by a final layer norm.
+
+    Arguments:
+        num_layers: Number of transformer layers.
+        hidden_size: The hidden size of the self attention.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                scores in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        checkpoint_activations: if True, checkpoint activations.
+        checkpoint_num_layers: number of layers to checkpoint. This
+                               is basically the chunk size in checkpointing.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method_std: standard deviation of the init method which has
+                         the form N(0, std).
+        use_scaled_init_for_output_weights: If True, use 1/sqrt(2*num_layers)
+                                            scaling for the output weights
+                                            (output of self attention and mlp).
+    """
+    def __init__(self,
+                 num_layers,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 checkpoint_activations,
+                 checkpoint_num_layers=1,
+                 layernorm_epsilon=1.0e-5,
+                 init_method_std=0.02,
+                 use_scaled_init_for_output_weights=True,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(GPT2ParallelTransformer, self).__init__()
+        # Store activation checkpointing flag.
+        self.checkpoint_activations = checkpoint_activations
+        self.checkpoint_num_layers = checkpoint_num_layers
+
+        output_layer_init_method = None
+        if use_scaled_init_for_output_weights:
+            output_layer_init_method = scaled_init_method(init_method_std,
+                                                          num_layers)
+
+        def get_layer():
+            return GPT2ParallelTransformerLayer(
+                hidden_size,
+                num_attention_heads,
+                attention_dropout_prob,
+                output_dropout_prob,
+                layernorm_epsilon,
+                unscaled_init_method(init_method_std),
+                output_layer_init_method=output_layer_init_method,
+                pruning_method=pruning_method,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale)
+
+        # Transformer layers.
+        self.layers = torch.nn.ModuleList(
+            [get_layer() for _ in range(num_layers)])
+
+        # Final layer norm before output.
+        self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+
+        if deepspeed.checkpointing.is_configured():
+            global get_cuda_rng_tracker, checkpoint
+            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
+            checkpoint = deepspeed.checkpointing.checkpoint
+
+    def forward(self, hidden_states, attention_mask, pruning_threshold=None):
+
+        def custom(start, end):
+            def custom_forward(*inputs):
+                layers_ = self.layers[start:end]
+                x_ = inputs[0]
+                for layer in layers_:
+                    x_ = layer(x_, inputs[1])
+                return x_
+            return custom_forward
+
+        if self.checkpoint_activations:
+            layer_idx = 0
+            num_layers = len(self.layers)
+            chunk_length = self.checkpoint_num_layers
+            while layer_idx < num_layers:
+                hidden_states = checkpoint(
+                    custom(layer_idx, layer_idx + chunk_length),
+                    hidden_states, attention_mask)
+                layer_idx += chunk_length
+        else:
+            for layer in self.layers:
+                hidden_states = layer(hidden_states, attention_mask,
+                                      pruning_threshold=pruning_threshold)
+
+        # Final layer norm.
+        output = self.final_layernorm(hidden_states)
+
+        return output
+
+
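A self-contained sketch (not part of the patch) of the chunked activation-checkpointing pattern in GPT2ParallelTransformer.forward above, with torch.utils.checkpoint.checkpoint standing in for the mpu/deepspeed implementation: only the chunk boundaries keep activations, and the chunk interiors are recomputed during backward.

import torch
from torch.utils.checkpoint import checkpoint

layers = torch.nn.ModuleList([torch.nn.Linear(16, 16) for _ in range(8)])

def run_chunk(start, end):
    # Closure over a slice of layers, mirroring custom(start, end) above.
    def forward(x):
        for layer in layers[start:end]:
            x = torch.relu(layer(x))
        return x
    return forward

x = torch.randn(4, 16, requires_grad=True)
chunk = 2  # plays the role of checkpoint_num_layers
for start in range(0, len(layers), chunk):
    x = checkpoint(run_chunk(start, start + chunk), x)
x.sum().backward()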
+class BertParallelSelfAttention(torch.nn.Module):
+    """Parallel self-attention layer for BERT.
+
+    The self-attention layer takes input with size [b, s, h] where b is
+    the batch size, s is the sequence length, and h is the hidden size,
+    and creates output of the same size.
+    Arguments:
+        hidden_size: total hidden size of the layer (h).
+        num_attention_heads: number of attention heads (n). Note that we
+                             require n to be divisible by the number of GPUs
+                             used to parallelize the model. Also, we
+                             require hidden size to be divisible by n.
+        dropout_prob: dropout probability for the attention scores.
+        output_parallel: If true, no all-gather is done on the output and
+                         the output values will be per partition.
+    We use the following notation:
+        h: hidden_size
+        n: num_attention_heads
+        p: number of partitions
+        np: n/p
+        hp: h/p
+        hn: h/n
+        b: batch size
+        s: sequence length
+    """
+    def __init__(self, hidden_size, num_attention_heads,
+                 dropout_prob, output_parallel=False,
+                 init_method=init.xavier_normal_, separate=False,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0, pruning_module=None,
+                 LR_weight_rank=8, LR_mask_rank=8):
+        super(BertParallelSelfAttention, self).__init__()
+        # Input configuration.
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.dropout_prob = dropout_prob
+        self.output_parallel = output_parallel
+        self.separate = separate
+        # Per attention head and per partition values.
+        world_size = get_model_parallel_world_size()
+        self.hidden_size_per_partition = divide(hidden_size, world_size)
+        self.hidden_size_per_attention_head = divide(hidden_size,
+                                                     num_attention_heads)
+        self.num_attention_heads_per_partition = divide(num_attention_heads,
+                                                        world_size)
+        # Strided linear layer.
+        if not separate:
+            self.query_key_value = ColumnParallelLinear(
+                hidden_size, 3 * hidden_size,
+                stride=3,
+                gather_output=False,
+                init_method=init_method,
+                pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self'] else None,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale,
+                LR_weight_rank=LR_weight_rank,
+                LR_mask_rank=LR_mask_rank)
+        else:
+            self.query = ColumnParallelLinear(
+                hidden_size, hidden_size, stride=1,
+                gather_output=False,
+                init_method=init_method,
+                pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfqk'] else None,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale,
+                LR_weight_rank=LR_weight_rank,
+                LR_mask_rank=LR_mask_rank)
+            self.key = ColumnParallelLinear(
+                hidden_size, hidden_size, stride=1,
+                gather_output=False,
+                init_method=init_method,
+                pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfqk'] else None,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale,
+                LR_weight_rank=LR_weight_rank,
+                LR_mask_rank=LR_mask_rank)
+            self.value = ColumnParallelLinear(
+                hidden_size, hidden_size, stride=1,
+                gather_output=False,
+                init_method=init_method,
+                pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfvo'] else None,
+                pruning_mask_init=pruning_mask_init,
+                pruning_mask_scale=pruning_mask_scale,
+                LR_weight_rank=LR_weight_rank,
+                LR_mask_rank=LR_mask_rank)
+        # self.key_value = ColumnParallelLinear(hidden_size, 2 * hidden_size, stride=2,
+        #                                       gather_output=False,
+        #                                       init_method=init_method,
+        #                                       pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_self', 'encoder_selfqk'] else None,
+        #                                       pruning_mask_init=pruning_mask_init,
+        #
pruning_mask_scale=pruning_mask_scale,) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.dropout = torch.nn.Dropout(dropout_prob) + + if deepspeed.checkpointing.is_configured(): + global get_cuda_rng_tracker, checkpoint + get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + checkpoint = deepspeed.checkpointing.checkpoint + self.timers = SynchronizedWallClockTimer() + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, pruning_threshold=None,): + + if pruning_threshold is not None: + query_pruning_threshold = pruning_threshold + key_pruning_threshold = pruning_threshold + # value_pruning_threshold = pruning_threshold if pruning_threshold > 0.03 else 0.03 + # value_pruning_threshold = 1 - (1 - pruning_threshold)/0.99*0.95 + value_pruning_threshold = pruning_threshold + + # Attention heads. [b, s, hp] + self.timers('self attention').start() + if not self.separate: + mixed_x_layer = self.query_key_value(hidden_states, pruning_threshold=pruning_threshold) + (mixed_query_layer, + mixed_key_layer, + mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + else: + mixed_query_layer = self.query(hidden_states, pruning_threshold=pruning_threshold) + # mixed_key_value_layer = self.key_value(hidden_states, pruning_threshold=key_pruning_threshold) + # (mixed_key_layer, mixed_value_layer) = split_tensor_along_last_dim(mixed_key_value_layer, 2) + mixed_key_layer = self.key(hidden_states, pruning_threshold=pruning_threshold) + mixed_value_layer = self.value(hidden_states, pruning_threshold=pruning_threshold) + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + + # Raw attention scores. [b, np, s, s] + norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head)) + attention_scores = torch.matmul(query_layer/norm_factor, + key_layer.transpose(-1, -2)/norm_factor) + # Apply the attention mask. + attention_scores += attention_mask + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + with get_cuda_rng_tracker().fork(): + attention_probs = self.dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + context_layer = torch.matmul(attention_probs, value_layer) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + self.timers('self attention').stop() + # Output. 
[b, s, h]
+        self.timers('self attention gather').start()
+        if self.output_parallel:
+            output = context_layer
+        else:
+            output = gather_from_model_parallel_region(context_layer)
+        self.timers('self attention gather').stop()
+        timer_names = ['self attention', 'self attention gather']
+        # self.timers.log(names=timer_names)
+        return output
+
+
+class BertParallelTransformerOutput(torch.nn.Module):
+    """The output layer used after self attention and intermediate
+    parts of transformer layer."""
+    def __init__(self, input_size, output_size, dropout_prob,
+                 layernorm_epsilon=1.0e-12, input_is_parallel=False,
+                 init_method=init.xavier_normal_,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(BertParallelTransformerOutput, self).__init__()
+        # Components.
+        self.dense = RowParallelLinear(input_size,
+                                       output_size,
+                                       input_is_parallel=input_is_parallel,
+                                       init_method=init_method,
+                                       pruning_method=pruning_method,
+                                       pruning_mask_init=pruning_mask_init,
+                                       pruning_mask_scale=pruning_mask_scale)
+        self.dropout = torch.nn.Dropout(dropout_prob)
+        self.layernorm = LayerNorm(output_size, eps=layernorm_epsilon)
+
+    def forward(self, hidden_states, input_tensor, pruning_threshold=None):
+        hidden_states = self.dense(hidden_states,
+                                   pruning_threshold=pruning_threshold)
+        hidden_states = self.dropout(hidden_states)
+        layernorm_input = hidden_states + input_tensor
+        hidden_states = self.layernorm(layernorm_input)
+        return hidden_states
+
+
+class BertParallelTransformerLayer(torch.nn.Module):
+    """A single layer transformer for BERT.
+
+    We use the following notation:
+        h: hidden size
+        n: number of attention heads
+        b: batch size
+        s: sequence length
+    The transformer layer takes input with size [b, s, h] and returns an
+    output of the same size.
+
+    Arguments:
+        hidden_size: The hidden size of the self attention.
+        intermediate_size: size of the intermediate state after
+                           self attention. In both BERT and GPT
+                           this is set to be 4 times the hidden
+                           size.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                scores in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        intermediate_activation_fn: activation function for output
+                                    of intermediate.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method: initialization method used for the weights. Note
+                     that all biases are initialized to zero and
+                     layernorm weights are initialized to one.
+    """
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 intermediate_activation_fn,
+                 layernorm_epsilon,
+                 init_method=init.xavier_normal_,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(BertParallelTransformerLayer, self).__init__()
+
+        # Self attention.
+        self.attention = BertParallelSelfAttention(hidden_size,
+                                                   num_attention_heads,
+                                                   attention_dropout_prob,
+                                                   output_parallel=True,
+                                                   init_method=init_method,
+                                                   pruning_method=pruning_method,
+                                                   pruning_mask_init=pruning_mask_init,
+                                                   pruning_mask_scale=pruning_mask_scale)
+        # Self attention output.
+        self.self_output = BertParallelTransformerOutput(
+            hidden_size, hidden_size, output_dropout_prob,
+            layernorm_epsilon=layernorm_epsilon,
+            input_is_parallel=True,
+            init_method=init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+        # Intermediate.
+        self.intermediate = ColumnParallelLinear(hidden_size, intermediate_size,
+                                                 gather_output=False,
+                                                 init_method=init_method,
+                                                 pruning_method=pruning_method,
+                                                 pruning_mask_init=pruning_mask_init,
+                                                 pruning_mask_scale=pruning_mask_scale)
+        self.intermediate_activation_fn = intermediate_activation_fn
+        # Output.
+        self.output = BertParallelTransformerOutput(
+            intermediate_size, hidden_size, output_dropout_prob,
+            layernorm_epsilon=layernorm_epsilon,
+            input_is_parallel=True,
+            init_method=init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=pruning_mask_init,
+            pruning_mask_scale=pruning_mask_scale)
+
+    def forward(self, hidden_states, attention_mask, pruning_threshold=None):
+        # [b, s, hp]
+        attention_output_parallel = self.attention(
+            hidden_states,
+            attention_mask,
+            pruning_threshold=pruning_threshold)
+        # [b, s, h]
+        attention_self_output = self.self_output(
+            attention_output_parallel,
+            hidden_states, pruning_threshold=pruning_threshold)
+        # [b, s, ip]
+        intermediate_output_parallel = self.intermediate(
+            attention_self_output, pruning_threshold=pruning_threshold)
+        intermediate_output_parallel = self.intermediate_activation_fn(
+            intermediate_output_parallel)
+        # [b, s, h]
+        layer_output = self.output(intermediate_output_parallel,
+                                   attention_self_output,
+                                   pruning_threshold=pruning_threshold)
+
+        return layer_output
+
+
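A quick numeric check (not part of the patch) of the scaling trick used in BertParallelSelfAttention.forward above: dividing both q and k by sqrt(sqrt(d)) before the matmul is mathematically the same as dividing the scores by sqrt(d), but keeps the intermediate magnitudes smaller, which is friendlier to fp16.

import math
import torch

b, n, s, d = 2, 4, 8, 16
q, k = torch.randn(b, n, s, d), torch.randn(b, n, s, d)
norm_factor = math.sqrt(math.sqrt(d))
scores_a = torch.matmul(q / norm_factor, k.transpose(-1, -2) / norm_factor)
scores_b = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d)
assert torch.allclose(scores_a, scores_b, atol=1e-5)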
+class PalmParallelSelfAttention(torch.nn.Module):
+    """Parallel self-attention layer for PALM.
+
+    The self-attention layer takes input with size [b, s, h] where b is
+    the batch size, s is the sequence length, and h is the hidden size,
+    and creates output of the same size.
+    Arguments:
+        hidden_size: total hidden size of the layer (h).
+        num_attention_heads: number of attention heads (n). Note that we
+                             require n to be divisible by the number of GPUs
+                             used to parallelize the model. Also, we
+                             require hidden size to be divisible by n.
+        dropout_prob: dropout probability for the attention scores.
+        init_method: weight initialization.
+        output_layer_init_method: output layer initialization. If None, use
+                                  `init_method`.
+    We use the following notation:
+        h: hidden_size
+        n: num_attention_heads
+        p: number of partitions
+        np: n/p
+        hp: h/p
+        hn: h/n
+        b: batch size
+        s: sequence length
+    """
+    def __init__(self, hidden_size, num_attention_heads,
+                 attention_dropout_prob, output_dropout_prob,
+                 init_method, output_layer_init_method=None,
+                 pruning_method=None, pruning_mask_init='constant',
+                 pruning_mask_scale=0.0):
+        super(PalmParallelSelfAttention, self).__init__()
+        # Set output layer initialization if not provided.
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+        # Per attention head and per partition values.
+        world_size = get_model_parallel_world_size()
+        self.hidden_size_per_partition = divide(hidden_size, world_size)
+        self.hidden_size_per_attention_head = divide(hidden_size,
+                                                     num_attention_heads)
+        self.num_attention_heads_per_partition = divide(num_attention_heads,
+                                                        world_size)
+        # Strided linear layer.
+        self.query_key_value = ColumnParallelLinear(hidden_size, 3 * hidden_size,
+                                                    stride=3,
+                                                    gather_output=False,
+                                                    init_method=init_method,
+                                                    pruning_method=pruning_method,
+                                                    pruning_mask_init=pruning_mask_init,
+                                                    pruning_mask_scale=pruning_mask_scale)
+        # Dropout. Note that for a single iteration, this layer will generate
+        # different outputs on different number of parallel partitions but
+        # on average it should not be partition dependent.
+        self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)
+
+        # Output.
+        self.dense = RowParallelLinear(hidden_size,
+                                       hidden_size,
+                                       input_is_parallel=True,
+                                       init_method=output_layer_init_method,
+                                       pruning_method=pruning_method,
+                                       pruning_mask_init=pruning_mask_init,
+                                       pruning_mask_scale=pruning_mask_scale)
+        self.output_dropout = torch.nn.Dropout(output_dropout_prob)
+
+        if deepspeed.checkpointing.is_configured():
+            global get_cuda_rng_tracker, checkpoint
+            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
+            checkpoint = deepspeed.checkpointing.checkpoint
+
+    def _transpose_for_scores(self, tensor):
+        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
+        size [b, np, s, hn].
+        """
+        new_tensor_shape = tensor.size()[:-1] + \
+            (self.num_attention_heads_per_partition,
+             self.hidden_size_per_attention_head)
+        tensor = tensor.view(*new_tensor_shape)
+        return tensor.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, ltor_mask, pruning_threshold=None):
+        # hidden_states: [b, s, h]
+        # ltor_mask: [1, 1, s, s]
+
+        # Attention heads. [b, s, hp]
+        mixed_x_layer = self.query_key_value(
+            hidden_states, pruning_threshold=pruning_threshold)
+        (mixed_query_layer,
+         mixed_key_layer,
+         mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+
+        # Reshape and transpose [b, np, s, hn]
+        query_layer = self._transpose_for_scores(mixed_query_layer)
+        key_layer = self._transpose_for_scores(mixed_key_layer)
+        value_layer = self._transpose_for_scores(mixed_value_layer)
+
+        # Raw attention scores. [b, np, s, s]
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(
+            self.hidden_size_per_attention_head)
+        # Apply the left to right attention mask.
+        attention_scores = torch.mul(attention_scores, ltor_mask) - \
+            10000.0 * (1.0 - ltor_mask)
+
+        # Attention probabilities. [b, np, s, s]
+        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        with get_cuda_rng_tracker().fork():
+            attention_probs = self.attention_dropout(attention_probs)
+
+        # Context layer.
+        # [b, np, s, hn]
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # [b, s, np, hn]
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        # [b, s, hp]
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        # Output. [b, s, h]
+        output = self.dense(context_layer, pruning_threshold=pruning_threshold)
+        output = self.output_dropout(output)
+
+        return output
+
+
+class PalmParallelCrossAttention(torch.nn.Module):
+    """Parallel cross-attention layer for PALM.
+
+    The cross-attention layer takes decoder states with size [b, s, h],
+    where b is the batch size, s is the sequence length, and h is the
+    hidden size, attends over the encoder states, and creates output of
+    the same size.
+    Arguments:
+        hidden_size: total hidden size of the layer (h).
+ num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size to be divisible by n. + dropout_prob: dropout probability for the attention scores. + init_method: weight initialization. + output_layer_init_method: output layer initialization. If None, use + `init_method`. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + def __init__(self, hidden_size, num_attention_heads, + attention_dropout_prob, output_dropout_prob, + init_method, output_layer_init_method=None, attn_separate=False, + pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0, pruning_module='all', + LR_weight_rank=8, LR_mask_rank=8): + super(PalmParallelCrossAttention, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size_per_partition = divide(hidden_size, world_size) + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = divide(num_attention_heads, + world_size) + # Strided linear layer. + self.query = ColumnParallelLinear(hidden_size, hidden_size, + stride=1, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method if pruning_module in ['all', 'decoder', 'decoder_cross', 'cross'] else None, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + + if not attn_separate: + self.key_value = ColumnParallelLinear(hidden_size, 2*hidden_size, + stride=2, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method if pruning_module in ['all', 'encoder', 'encoder_cross', 'cross'] else None, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + else: + self.key = ColumnParallelLinear(hidden_size, hidden_size, + stride=1, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method if pruning_module in ['all', 'decoder', 'decoder_cross', 'cross'] else None, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + self.value = ColumnParallelLinear(hidden_size, hidden_size, + stride=1, + gather_output=False, + init_method=init_method, + pruning_method=pruning_method if pruning_module in ['all', 'decoder', 'decoder_cross', 'cross'] else None, + pruning_mask_init=pruning_mask_init, + pruning_mask_scale=pruning_mask_scale, + LR_weight_rank=LR_weight_rank, + LR_mask_rank=LR_mask_rank) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. 
+        self.dense = RowParallelLinear(hidden_size,
+                                       hidden_size,
+                                       input_is_parallel=True,
+                                       init_method=output_layer_init_method,
+                                       pruning_method=pruning_method if pruning_module in ['all', 'decoder', 'decoder_cross', 'cross'] else None,
+                                       pruning_mask_init=pruning_mask_init,
+                                       pruning_mask_scale=pruning_mask_scale,
+                                       LR_weight_rank=LR_weight_rank,
+                                       LR_mask_rank=LR_mask_rank)
+        self.output_dropout = torch.nn.Dropout(output_dropout_prob)
+        self.attn_separate = attn_separate
+        if deepspeed.checkpointing.is_configured():
+            global get_cuda_rng_tracker, checkpoint
+            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
+            checkpoint = deepspeed.checkpointing.checkpoint
+
+    def _transpose_for_scores(self, tensor):
+        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
+        size [b, np, s, hn].
+        """
+        new_tensor_shape = tensor.size()[:-1] + \
+            (self.num_attention_heads_per_partition,
+             self.hidden_size_per_attention_head)
+        tensor = tensor.view(*new_tensor_shape)
+        return tensor.permute(0, 2, 1, 3)
+
+    def forward(self, query, enc_hidden_states, enc_attn_mask,
+                pruning_threshold=None):
+        # query: [b, s, h] (decoder hidden states)
+        # enc_hidden_states: [b, s_enc, h]
+        # enc_attn_mask: additive attention mask over the encoder states
+
+        # A single pruning threshold is shared by the query, key, value
+        # and dense projections below.
+
+        # Attention heads. [b, s, hp]
+        mixed_query_layer = self.query(query,
+                                       pruning_threshold=pruning_threshold)
+        if not self.attn_separate:
+            mixed_x_layer = self.key_value(enc_hidden_states,
+                                           pruning_threshold=pruning_threshold)
+            (mixed_key_layer,
+             mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 2)
+        else:
+            mixed_key_layer = self.key(enc_hidden_states,
+                                       pruning_threshold=pruning_threshold)
+            mixed_value_layer = self.value(enc_hidden_states,
+                                           pruning_threshold=pruning_threshold)
+
+        # Reshape and transpose [b, np, s, hn]
+        query_layer = self._transpose_for_scores(mixed_query_layer)
+        key_layer = self._transpose_for_scores(mixed_key_layer)
+        value_layer = self._transpose_for_scores(mixed_value_layer)
+
+        # Raw attention scores. [b, np, s, s]
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(
+            self.hidden_size_per_attention_head)
+        attention_scores += enc_attn_mask
+
+        # Attention probabilities. [b, np, s, s]
+        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        with get_cuda_rng_tracker().fork():
+            attention_probs = self.attention_dropout(attention_probs)
+
+        # Context layer.
+        # [b, np, s, hn]
+        context_layer = torch.matmul(attention_probs, value_layer)
+        # [b, s, np, hn]
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        # [b, s, hp]
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        # Output. [b, s, h]
+        output = self.dense(context_layer, pruning_threshold=pruning_threshold)
+        output = self.output_dropout(output)
+
+        return output
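A self-contained shape walk-through (not part of the patch) of the _transpose_for_scores reshape shared by the attention classes above: [b, s, np*hn] -> [b, s, np, hn] -> [b, np, s, hn].

import torch

b, s, np_, hn = 2, 8, 4, 16           # batch, seq, heads-per-partition, head dim
x = torch.randn(b, s, np_ * hn)       # [b, s, np*hn]
x = x.view(b, s, np_, hn).permute(0, 2, 1, 3)
assert x.shape == (b, np_, s, hn)     # [b, np, s, hn]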
diff --git a/modelscope/utils/nlp/mpu/utils.py b/modelscope/utils/nlp/mpu/utils.py
new file mode 100755
index 00000000..94afafd5
--- /dev/null
+++ b/modelscope/utils/nlp/mpu/utils.py
@@ -0,0 +1,70 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+
+
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
+        numerator, denominator)
+
+
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+
+
+def split_tensor_along_last_dim(tensor, num_partitions,
+                                contiguous_split_chunks=False):
+    """Split a tensor along its last dimension.
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor into.
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks and return the
+    first and last index of the vocabulary belonging to the `rank`
+    partition. Note that indices are in [first, last)."""
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
+                                                  rank, world_size):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank,
+                                           world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            per_partition_vocab_size, rank, world_size)
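A tiny usage sketch (not part of the patch) of VocabUtility, assuming the module path this patch introduces: with a 32000-token vocabulary split across 4 model-parallel ranks, rank 1 owns the half-open index range [8000, 16000).

from modelscope.utils.nlp.mpu.utils import VocabUtility

first, last = VocabUtility.vocab_range_from_global_vocab_size(
    global_vocab_size=32000, rank=1, world_size=4)
assert (first, last) == (8000, 16000)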
diff --git a/modelscope/utils/nlp/utils.py b/modelscope/utils/nlp/utils.py
new file mode 100644
index 00000000..19bba16a
--- /dev/null
+++ b/modelscope/utils/nlp/utils.py
@@ -0,0 +1,109 @@
+"""Utilities for logging and serialization"""
+
+import os
+import random
+import time
+
+import numpy as np
+import torch
+
+
+def get_log_constant(user_log):
+    return '[user log]' if user_log else ''
+
+
+def print_rank_0(message):
+    if torch.distributed.is_initialized():
+        if torch.distributed.get_rank() == 0:
+            print(message, flush=True)
+    else:
+        print(message, flush=True)
+
+
+def print_args(args):
+    """Print arguments."""
+
+    print('arguments:', flush=True)
+    for arg in vars(args):
+        dots = '.' * (29 - len(arg))
+        print('  {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)
+
+
+def report_memory(name):
+    """Simple GPU memory report."""
+
+    mega_bytes = 1024.0 * 1024.0
+    string = name + ' memory (MB)'
+    string += ' | allocated: {}'.format(
+        torch.cuda.memory_allocated() / mega_bytes)
+    string += ' | max allocated: {}'.format(
+        torch.cuda.max_memory_allocated() / mega_bytes)
+    string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes)
+    string += ' | max cached: {}'.format(
+        torch.cuda.max_memory_cached() / mega_bytes)
+    print_rank_0(string)
+
+
+class Timers:
+    """Group of timers."""
+
+    class Timer:
+        """Timer."""
+
+        def __init__(self, name):
+            self.name_ = name
+            self.elapsed_ = 0.0
+            self.started_ = False
+            self.start_time = time.time()
+
+        def start(self):
+            """Start the timer."""
+            assert not self.started_, 'timer has already been started'
+            torch.cuda.synchronize()
+            self.start_time = time.time()
+            self.started_ = True
+
+        def stop(self):
+            """Stop the timer."""
+            assert self.started_, 'timer is not started'
+            torch.cuda.synchronize()
+            self.elapsed_ += (time.time() - self.start_time)
+            self.started_ = False
+
+        def reset(self):
+            """Reset timer."""
+            self.elapsed_ = 0.0
+            self.started_ = False
+
+        def elapsed(self, reset=True):
+            """Calculate the elapsed time."""
+            started_ = self.started_
+            # If timing is in progress, end it first.
+            if self.started_:
+                self.stop()
+            # Get the elapsed time.
+            elapsed_ = self.elapsed_
+            # Reset the elapsed time.
+            if reset:
+                self.reset()
+            # If timing was in progress, restart it.
+            if started_:
+                self.start()
+            return elapsed_
+
+    def __init__(self):
+        self.timers = {}
+
+    def __call__(self, name):
+        if name not in self.timers:
+            self.timers[name] = self.Timer(name)
+        return self.timers[name]
+
+    def log(self, names, normalizer=1.0, reset=True):
+        """Log a group of timers."""
+        assert normalizer > 0.0
+        string = 'time (ms)'
+        for name in names:
+            elapsed_time = self.timers[name].elapsed(
+                reset=reset) * 1000.0 / normalizer
+            string += ' | {}: {:.2f}'.format(name, elapsed_time)
+        print_rank_0(string)
diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py
index 1f157f9a..ee9dde13 100644
--- a/modelscope/utils/torch_utils.py
+++ b/modelscope/utils/torch_utils.py
@@ -50,13 +50,13 @@ def _init_dist_pytorch(backend: str, **kwargs) -> None:
     # rank = int(os.environ['RANK'])
     local_rank = int(os.environ['LOCAL_RANK'])
-    torch.cuda.set_device(local_rank)
+    # torch.cuda.set_device(local_rank)
     dist.init_process_group(backend=backend, **kwargs)
 
 
 def _init_dist_mpi(backend: str, **kwargs) -> None:
     local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
-    torch.cuda.set_device(local_rank)
+    # torch.cuda.set_device(local_rank)
     if 'MASTER_PORT' not in os.environ:
         # 29500 is torch.distributed default port
         os.environ['MASTER_PORT'] = '29500'
@@ -82,7 +82,7 @@ def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None:
     ntasks = int(os.environ['SLURM_NTASKS'])
     node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
-    torch.cuda.set_device(proc_id % num_gpus)
+    # torch.cuda.set_device(proc_id % num_gpus)
     addr = subprocess.getoutput(
         f'scontrol show hostname {node_list} | head -n1')
     # specify master port
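A minimal usage sketch (not part of the patch) of the Timers helper added in modelscope/utils/nlp/utils.py above, assuming a CUDA-capable process (start/stop call torch.cuda.synchronize()) and the import path this patch introduces.

from modelscope.utils.nlp.utils import Timers

timers = Timers()
timers('forward').start()
# ... timed work goes here ...
timers('forward').stop()
timers.log(['forward'])  # prints e.g. "time (ms) | forward: 12.34" on rank 0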
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index c08209a4..3a56a941 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -3,7 +3,7 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration
+from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration, PlugForTextGeneration
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextGenerationPipeline
 from modelscope.preprocessors import TextGenerationPreprocessor
@@ -34,83 +34,15 @@ class TextGenerationTest(unittest.TestCase):
         self.gpt3_large_model_id = 'damo/nlp_gpt3_text-generation_chinese-large'
         self.gpt3_input = '《故乡》。深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地,'
 
-    def run_pipeline_with_model_instance(self, model_id, input):
-        model = Model.from_pretrained(model_id)
-        preprocessor = TextGenerationPreprocessor(
-            model.model_dir,
-            model.tokenizer,
-            first_sequence='sentence',
-            second_sequence=None)
-        pipeline_ins = pipeline(
-            task=Tasks.text_generation, model=model, preprocessor=preprocessor)
-        print(pipeline_ins(input))
+        self.plug_model_id = 'damo/nlp_plug_text-generation_chinese'
+        self.plug_input = '段誉轻挥折扇,摇了摇头,说'
 
-    def run_pipeline_with_model_id(self, model_id, input):
-        pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
-        print(pipeline_ins(input))
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_palm_zh_with_model_name(self):
-        self.run_pipeline_with_model_id(self.palm_model_id_zh,
-                                        self.palm_input_zh)
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_palm_en_with_model_name(self):
-        self.run_pipeline_with_model_id(self.palm_model_id_en,
-                                        self.palm_input_en)
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_gpt_base_with_model_name(self):
-        self.run_pipeline_with_model_id(self.gpt3_base_model_id,
-                                        self.gpt3_input)
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_gpt_large_with_model_name(self):
-        self.run_pipeline_with_model_id(self.gpt3_large_model_id,
-                                        self.gpt3_input)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_palm_zh_with_model_instance(self):
-        self.run_pipeline_with_model_instance(self.palm_model_id_zh,
-                                              self.palm_input_zh)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_palm_en_with_model_instance(self):
-        self.run_pipeline_with_model_instance(self.palm_model_id_en,
-                                              self.palm_input_en)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_gpt_base_with_model_instance(self):
-        self.run_pipeline_with_model_instance(self.gpt3_base_model_id,
-                                              self.gpt3_input)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_gpt_large_with_model_instance(self):
-        self.run_pipeline_with_model_instance(self.gpt3_large_model_id,
-                                              self.gpt3_input)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_palm(self):
-        for model_id, input in ((self.palm_model_id_zh, self.palm_input_zh),
-                                (self.palm_model_id_en, self.palm_input_en)):
-            cache_path = snapshot_download(model_id)
-            model = PalmForTextGeneration.from_pretrained(cache_path)
-            preprocessor = TextGenerationPreprocessor(
-                cache_path,
-                model.tokenizer,
-                first_sequence='sentence',
-                second_sequence=None)
-            pipeline1 = TextGenerationPipeline(model, preprocessor)
-            pipeline2 = pipeline(
-                Tasks.text_generation, model=model, preprocessor=preprocessor)
-            print(
-                f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
-            )
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_gpt3(self):
-        cache_path = snapshot_download(self.gpt3_base_model_id)
-        model = GPT3ForTextGeneration(cache_path)
+    def test_plug(self):
+        import torch
+        print('start_method',
+              str(torch.multiprocessing.get_start_method(allow_none=True)))
+        torch.multiprocessing.set_start_method('spawn')
+        cache_path = snapshot_download(self.plug_model_id)
+        model = PlugForTextGeneration(cache_path)
         preprocessor = TextGenerationPreprocessor(
             cache_path,
             model.tokenizer,
@@ -120,13 +52,102 @@ class TextGenerationTest(unittest.TestCase):
         pipeline2 = pipeline(
             Tasks.text_generation, model=model, preprocessor=preprocessor)
         print(
-            f'pipeline1: {pipeline1(self.gpt3_input)}\npipeline2: {pipeline2(self.gpt3_input)}'
+            f'pipeline1: {pipeline1(self.plug_input)}\npipeline2: {pipeline2(self.plug_input)}'
         )
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_default_model(self):
-        pipeline_ins = pipeline(task=Tasks.text_generation)
-        print(pipeline_ins(self.palm_input_zh))
+    # def run_pipeline_with_model_instance(self, model_id, input):
+    #     model = Model.from_pretrained(model_id)
+    #     preprocessor = TextGenerationPreprocessor(
+    #         model.model_dir,
+    #         model.tokenizer,
+    #         first_sequence='sentence',
+    #         second_sequence=None)
+    #     pipeline_ins = pipeline(
+    #         task=Tasks.text_generation, model=model, preprocessor=preprocessor)
+    #     print(pipeline_ins(input))
+
+    # def run_pipeline_with_model_id(self, model_id, input):
+    #     pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
+    #     print(pipeline_ins(input))
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    # def test_palm_zh_with_model_name(self):
+    #     self.run_pipeline_with_model_id(self.palm_model_id_zh,
+    #                                     self.palm_input_zh)
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    # def test_palm_en_with_model_name(self):
+    #     self.run_pipeline_with_model_id(self.palm_model_id_en,
+    #                                     self.palm_input_en)
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    # def test_gpt_base_with_model_name(self):
+    #     self.run_pipeline_with_model_id(self.gpt3_base_model_id,
+    #                                     self.gpt3_input)
+
+    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    # def test_gpt_large_with_model_name(self):
+    #     self.run_pipeline_with_model_id(self.gpt3_large_model_id,
+    #                                     self.gpt3_input)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_palm_zh_with_model_instance(self):
+    #     self.run_pipeline_with_model_instance(self.palm_model_id_zh,
+    #                                           self.palm_input_zh)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_palm_en_with_model_instance(self):
+    #     self.run_pipeline_with_model_instance(self.palm_model_id_en,
+    #                                           self.palm_input_en)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_gpt_base_with_model_instance(self):
+    #     self.run_pipeline_with_model_instance(self.gpt3_base_model_id,
+    #                                           self.gpt3_input)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_gpt_large_with_model_instance(self):
+    #     self.run_pipeline_with_model_instance(self.gpt3_large_model_id,
+    #                                           self.gpt3_input)
+
+    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    # def test_run_palm(self):
+    #     for model_id, input in 
((self.palm_model_id_zh, self.palm_input_zh), + # (self.palm_model_id_en, self.palm_input_en)): + # cache_path = snapshot_download(model_id) + # model = PalmForTextGeneration.from_pretrained(cache_path) + # preprocessor = TextGenerationPreprocessor( + # cache_path, + # model.tokenizer, + # first_sequence='sentence', + # second_sequence=None) + # pipeline1 = TextGenerationPipeline(model, preprocessor) + # pipeline2 = pipeline( + # Tasks.text_generation, model=model, preprocessor=preprocessor) + # print( + # f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}' + # ) + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + # def test_run_gpt3(self): + # cache_path = snapshot_download(self.gpt3_base_model_id) + # model = GPT3ForTextGeneration(cache_path) + # preprocessor = TextGenerationPreprocessor( + # cache_path, + # model.tokenizer, + # first_sequence='sentence', + # second_sequence=None) + # pipeline1 = TextGenerationPipeline(model, preprocessor) + # pipeline2 = pipeline( + # Tasks.text_generation, model=model, preprocessor=preprocessor) + # print( + # f'pipeline1: {pipeline1(self.gpt3_input)}\npipeline2: {pipeline2(self.gpt3_input)}' + # ) + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + # def test_run_with_default_model(self): + # pipeline_ins = pipeline(task=Tasks.text_generation) + # print(pipeline_ins(self.palm_input_zh)) if __name__ == '__main__':