Mirror of https://github.com/modelscope/modelscope.git (synced 2026-02-24 20:19:51 +01:00)

Commit: Runnable but unfinished
@@ -32,6 +32,7 @@ class Models(object):
    tcrf = 'transformer-crf'
    bart = 'bart'
    gpt3 = 'gpt3'
    plug = 'plug'

    # audio models
    sambert_hifigan = 'sambert-hifigan'
@@ -20,6 +20,7 @@ if TYPE_CHECKING:
    from .task_models.task_model import SingleBackboneTaskModelBase
    from .bart_for_text_error_correction import BartForTextErrorCorrection
    from .gpt3 import GPT3ForTextGeneration
    from .plug import PlugForTextGeneration

else:
    _import_structure = {
@@ -42,6 +43,7 @@ else:
        'task_model': ['SingleBackboneTaskModelBase'],
        'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
        'gpt3': ['GPT3ForTextGeneration'],
        'plug': ['PlugForTextGeneration'],
    }

    import sys
27  modelscope/models/nlp/plug/__init__.py  Normal file
@@ -0,0 +1,27 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .configuration_plug import PlugNLGConfig
    from .modeling_plug import PlugModel
    from .distributed_plug import DistributedPlug
    from .plug_for_text_generation import PlugForTextGeneration
else:
    _import_structure = {
        'configuration_plug': ['PlugNLGConfig'],
        'modeling_plug': ['PlugModel'],
        'distributed_plug': ['DistributedPlug'],
        'plug_for_text_generation': ['PlugForTextGeneration'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
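Illustrative sketch (not part of this commit): the lazy-import registration above defers the heavy PLUG submodules until a listed name is first accessed. Assuming the package and its dependencies are installed, usage would look roughly like:

    # Attribute access on the package triggers LazyImportModule to import the
    # real submodule (configuration_plug.py) on demand.
    from modelscope.models.nlp.plug import PlugNLGConfig

    config = PlugNLGConfig()           # defaults come from configuration_plug.py below
    print(config.num_hidden_layers)    # -> 12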
414  modelscope/models/nlp/plug/arguments.py  Executable file
@@ -0,0 +1,414 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""argparser configuration"""

import argparse
import os
import torch
import deepspeed


def add_model_config_args(parser):
    """Model arguments"""

    group = parser.add_argument_group('model', 'model configuration')

    group.add_argument('--pretrained-bert', action='store_true',
                       help='use a pretrained bert-large-uncased model instead '
                       'of initializing from scratch. See '
                       '--tokenizer-model-type to specify which pretrained '
                       'BERT model to use')
    group.add_argument('--attention-dropout', type=float, default=0.1,
                       help='dropout probability for attention weights')
    group.add_argument('--num-attention-heads', type=int, default=16,
                       help='num of transformer attention heads')
    group.add_argument('--hidden-size', type=int, default=1024,
                       help='transformer hidden size')
    group.add_argument('--intermediate-size', type=int, default=None,
                       help='transformer embedding dimension for FFN; '
                       'set to 4*`--hidden-size` if it is None')
    group.add_argument('--num-layers', type=int, default=24,
                       help='num decoder layers')
    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                       help='layer norm epsilon')
    group.add_argument('--hidden-dropout', type=float, default=0.1,
                       help='dropout probability for hidden state transformer')
    group.add_argument('--max-position-embeddings', type=int, default=512,
                       help='maximum number of position embeddings to use')
    group.add_argument('--vocab-size', type=int, default=30522,
                       help='vocab size to use for non-character-level '
                       'tokenization. This value will only be used when '
                       'creating a tokenizer')
    group.add_argument('--deep-init', action='store_true',
                       help='initialize bert model similar to gpt2 model. '
                       'scales initialization of projection layers by a '
                       'factor of 1/sqrt(2N). Necessary to train bert '
                       'models larger than BERT-Large.')
    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                       help='Pad the vocab size to be divisible by this value. '
                       'This is added for computational efficiency reasons.')
    group.add_argument('--cpu-optimizer', action='store_true',
                       help='Run optimizer on CPU')
    group.add_argument('--cpu_torch_adam', action='store_true',
                       help='Use Torch Adam as optimizer on CPU.')

    return parser


def add_fp16_config_args(parser):
    """Mixed precision arguments."""

    group = parser.add_argument_group('fp16', 'fp16 configurations')

    group.add_argument('--fp16', action='store_true',
                       help='Run model in fp16 mode')
    group.add_argument('--fp32-embedding', action='store_true',
                       help='embedding in fp32')
    group.add_argument('--fp32-layernorm', action='store_true',
                       help='layer norm in fp32')
    group.add_argument('--fp32-tokentypes', action='store_true',
                       help='embedding token types in fp32')
    group.add_argument('--fp32-allreduce', action='store_true',
                       help='all-reduce in fp32')
    group.add_argument('--hysteresis', type=int, default=2,
                       help='hysteresis for dynamic loss scaling')
    group.add_argument('--loss-scale', type=float, default=None,
                       help='Static loss scaling, positive power of 2 '
                       'values can improve fp16 convergence. If None, dynamic '
                       'loss scaling is used.')
    group.add_argument('--loss-scale-window', type=float, default=1000,
                       help='Window over which to raise/lower dynamic scale')
    group.add_argument('--min-scale', type=float, default=1,
                       help='Minimum loss scale for dynamic loss scale')

    return parser


def add_training_args(parser):
    """Training arguments."""

    group = parser.add_argument_group('train', 'training configurations')

    group.add_argument('--batch-size', type=int, default=4,
                       help='Data Loader batch size')
    group.add_argument('--weight-decay', type=float, default=0.01,
                       help='weight decay coefficient for L2 regularization')
    group.add_argument('--checkpoint-activations', action='store_true',
                       help='checkpoint activation to allow for training '
                       'with larger models and sequences')
    group.add_argument('--checkpoint-num-layers', type=int, default=1,
                       help='chunk size (number of layers) for checkpointing')
    group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
                       help='uses activation checkpointing from deepspeed')
    group.add_argument('--clip-grad', type=float, default=1.0,
                       help='gradient clipping')
    group.add_argument('--train-iters', type=int, default=1000000,
                       help='total number of iterations to train over all training runs')
    group.add_argument('--log-interval', type=int, default=100,
                       help='report interval')
    group.add_argument('--exit-interval', type=int, default=None,
                       help='Exit the program after this many new iterations.')

    group.add_argument('--seed', type=int, default=1234,
                       help='random seed')
    # Batch producer arguments
    group.add_argument('--reset-position-ids', action='store_true',
                       help='Reset position ids after end-of-document token.')
    group.add_argument('--reset-attention-mask', action='store_true',
                       help='Reset self attention mask after '
                       'end-of-document token.')

    # Learning rate.
    group.add_argument('--lr-decay-iters', type=int, default=None,
                       help='number of iterations to decay LR over,'
                       ' If None defaults to `--train-iters`*`--epochs`')
    group.add_argument('--lr-decay-style', type=str, default='linear',
                       choices=['constant', 'linear', 'cosine', 'exponential'],
                       help='learning rate decay function')
    group.add_argument('--lr', type=float, default=1.0e-4,
                       help='initial learning rate')
    group.add_argument('--warmup', type=float, default=0.01,
                       help='percentage of data to warmup on (.01 = 1% of all '
                       'training iters). Default 0.01')
    group.add_argument('--batch-warmup', type=float, default=0.01,
                       help='percentage of data to warmup on (.01 = 1% of all '
                       'training iters). Default 0.01')
    group.add_argument('--length-warmup', type=float, default=0.01,
                       help='percentage of data to warmup on (.01 = 1% of all '
                       'training iters). Default 0.01')
    # model checkpointing
    group.add_argument('--save', type=str, default=None,
                       help='Output directory to save checkpoints to.')
    group.add_argument('--save-interval', type=int, default=2000,
                       help='number of iterations between saves')
    group.add_argument('--no-save-optim', action='store_true',
                       help='Do not save current optimizer.')
    group.add_argument('--no-save-rng', action='store_true',
                       help='Do not save current rng state.')
    group.add_argument('--load', type=str, default=None,
                       help='Path to a directory containing a model checkpoint.')
    group.add_argument('--load-iteration', type=str, default=0,
                       help='Load iteration of a model checkpoint.')
    group.add_argument('--pre-load', action='store_true',
                       help='Use pre-load instead of deepspeed load.')
    group.add_argument('--no-load-optim', action='store_true',
                       help='Do not load optimizer when loading checkpoint.')
    group.add_argument('--no-load-rng', action='store_true',
                       help='Do not load rng state when loading checkpoint.')
    group.add_argument('--no-load-lr', action='store_true',
                       help='Do not load lr schedule when loading checkpoint.')
    group.add_argument('--finetune', action='store_true',
                       help='Load model for finetuning. Do not load optimizer '
                       'or rng state from checkpoint and set iteration to 0. '
                       'Assumed when loading a release checkpoint.')
    group.add_argument('--resume-dataloader', action='store_true',
                       help='Resume the dataloader when resuming training. '
                       'Does not apply to tfrecords dataloader, try resuming '
                       'with a different seed in this case.')
    # distributed training args
    group.add_argument('--distributed-backend', default='nccl',
                       help='which backend to use for distributed '
                       'training. One of [gloo, nccl]')

    group.add_argument('--local_rank', type=int, default=None,
                       help='local rank passed from distributed launcher')

    return parser


def add_evaluation_args(parser):
    """Evaluation arguments."""

    group = parser.add_argument_group('validation', 'validation configurations')

    group.add_argument('--eval-batch-size', type=int, default=None,
                       help='Data Loader batch size for evaluation datasets. '
                       'Defaults to `--batch-size`')
    group.add_argument('--eval-iters', type=int, default=100,
                       help='number of iterations to run for evaluation '
                       'validation/test for')
    group.add_argument('--eval-interval', type=int, default=1000,
                       help='interval between running evaluation on validation set')
    group.add_argument('--eval-seq-length', type=int, default=None,
                       help='Maximum sequence length to process for '
                       'evaluation. Defaults to `--seq-length`')
    group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
                       help='Maximum number of predictions to use for '
                       'evaluation. Defaults to '
                       'math.ceil(`--eval-seq-length`*.15/10)*10')
    group.add_argument('--overlapping-eval', type=int, default=32,
                       help='sliding window for overlapping eval')
    group.add_argument('--cloze-eval', action='store_true',
                       help='Evaluation dataset from `--valid-data` is a cloze task')
    group.add_argument('--eval-hf', action='store_true',
                       help='perform evaluation with huggingface openai model. '
                       'Use `--load` to specify weights path to be loaded')
    group.add_argument('--load-openai', action='store_true',
                       help='load openai weights into our model. Use `--load` '
                       'to specify weights path to be loaded')

    return parser


def add_text_generate_args(parser):
    """Text generate arguments."""

    group = parser.add_argument_group('Text generation', 'configurations')
    group.add_argument("--temperature", type=float, default=1.0)
    group.add_argument("--top_p", type=float, default=0.0)
    group.add_argument("--top_k", type=int, default=0)
    group.add_argument("--out-seq-length", type=int, default=256)
    return parser


def add_struct_args(parser):
    group = parser.add_argument_group('struct', 'struct configurations')
    group.add_argument("--gradient-accumulation-steps", type=int, default=1,
                       help='Not implemented yet.')
    group.add_argument("--num-epochs", type=int, default=1,
                       help='Not implemented yet.')
    group.add_argument("--struct-bert-dataset", action='store_true', default=False,
                       help='Use struct bert dataset or not.')
    return parser


def add_palm_args(parser):
    group = parser.add_argument_group('palm', 'palm configurations')
    group.add_argument('--dec-layers', type=int, default=6,
                       help='num decoder layers')
    group.add_argument('--tgt-length', type=int, default=100,
                       help='target sequence length')
    group.add_argument('--vae-size', type=int, default=8192,
                       help='vae code vocab size')
    group.add_argument('--max-image-position', type=int, default=1025,
                       help='max image decode position')
    group.add_argument("--palm-dataset", action='store_true', default=False,
                       help='Use palm dataset or not.')
    group.add_argument("--image-dataset", action='store_true', default=False,
                       help='Use image dataset or not.')
    group.add_argument("--do-mask-lm", action='store_true', default=False,
                       help='Do mask lm task or not.')
    group.add_argument('--vae-enc-model', type=str, default=None,
                       help='Path to a directory containing a model checkpoint.')
    return parser


def add_downstream_args(parser):
    group = parser.add_argument_group('downstream', 'downstream configurations')
    group.add_argument("--downstream-dataset", action='store_true', default=False,
                       help='Use downstream dataset or not.')
    group.add_argument("--task-name", default='ocnli', type=str)
    return parser


def add_data_args(parser):
    """Train/valid/test data arguments."""

    group = parser.add_argument_group('data', 'data configurations')

    group.add_argument('--model-parallel-size', type=int, default=1,
                       help='size of the model parallel.')
    group.add_argument('--shuffle', action='store_true',
                       help='Shuffle data. Shuffling is deterministic '
                       'based on seed and current epoch.')
    group.add_argument('--train-data', nargs='+', default=None,
                       help='Whitespace separated filenames or corpora names '
                       'for training.')

    group.add_argument('--use-npy-data-loader', action='store_true',
                       help='Use the numpy data loader. If set, then '
                       'train-data-path, val-data-path, and test-data-path '
                       'should also be provided.')
    group.add_argument('--train-data-path', type=str, default='',
                       help='path to the training data')
    group.add_argument('--val-data-path', type=str, default='',
                       help='path to the validation data')
    group.add_argument('--test-data-path', type=str, default='',
                       help='path to the test data')
    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
                       help='the filename containing all the shards sizes')

    group.add_argument('--delim', default=',',
                       help='delimiter used to parse csv data files')
    group.add_argument('--text-key', default='sentence',
                       help='key to use to extract text from json/csv')
    group.add_argument('--eval-text-key', default=None,
                       help='key to use to extract text from '
                       'json/csv evaluation datasets')
    group.add_argument('--valid-data', nargs='*', default=None,
                       help="""Filename for validation data.""")
    group.add_argument('--split', default='1000,1,1',
                       help='comma-separated list of proportions for training,'
                       ' validation, and test split')
    group.add_argument('--test-data', nargs='*', default=None,
                       help="""Filename for testing""")

    group.add_argument('--lazy-loader', action='store_true',
                       help='whether to lazy read the data set')
    group.add_argument('--loose-json', action='store_true',
                       help='Use loose json (one json-formatted string per '
                       'newline), instead of tight json (data file is one '
                       'json string)')
    group.add_argument('--presplit-sentences', action='store_true',
                       help='Dataset content consists of documents where '
                       'each document consists of newline separated sentences')
    group.add_argument('--num-workers', type=int, default=2,
                       help="""Number of workers to use for dataloading""")
    group.add_argument('--tokenizer-model-type', type=str,
                       default='bert-large-uncased',
                       help="Model type to use for sentencepiece tokenization \
                       (one of ['bpe', 'char', 'unigram', 'word']) or \
                       bert vocab to use for BertWordPieceTokenizer (one of \
                       ['bert-large-uncased', 'bert-large-cased', etc.])")
    group.add_argument('--tokenizer-path', type=str, default='tokenizer.model',
                       help='path used to save/load sentencepiece tokenization '
                       'models')
    group.add_argument('--tokenizer-type', type=str,
                       default='BertWordPieceTokenizer',
                       choices=['CharacterLevelTokenizer',
                                'SentencePieceTokenizer',
                                'BertWordPieceTokenizer',
                                'GPT2BPETokenizer'],
                       help='what type of tokenizer to use')
    group.add_argument("--cache-dir", default=None, type=str,
                       help="Where to store pre-trained BERT downloads")
    group.add_argument('--use-tfrecords', action='store_true',
                       help='load `--train-data`, `--valid-data`, '
                       '`--test-data` from BERT tf records instead of '
                       'normal data pipeline')
    group.add_argument('--seq-length', type=int, default=512,
                       help="Maximum sequence length to process")
    group.add_argument('--max-preds-per-seq', type=int, default=None,
                       help='Maximum number of predictions to use per sequence. '
                       'Defaults to math.ceil(`--seq-length`*.15/10)*10. '
                       'MUST BE SPECIFIED IF `--use-tfrecords` is True.')

    return parser


def get_args():
    """Parse all the args."""

    parser = argparse.ArgumentParser(description='PyTorch BERT Model')
    parser = add_model_config_args(parser)
    parser = add_fp16_config_args(parser)
    parser = add_training_args(parser)
    parser = add_evaluation_args(parser)
    parser = add_text_generate_args(parser)
    parser = add_struct_args(parser)
    parser = add_palm_args(parser)
    parser = add_downstream_args(parser)
    parser = add_data_args(parser)

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()

    args.deepspeed = False

    args.cuda = torch.cuda.is_available()

    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv("WORLD_SIZE", '1'))

    if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
        # We are using (OpenMPI) mpirun for launching distributed data parallel processes
        local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))

        # Possibly running with Slurm
        num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
        nodeid = int(os.getenv('SLURM_NODEID', '0'))

        args.local_rank = local_rank
        args.rank = nodeid * local_size + local_rank
        args.world_size = num_nodes * local_size

    args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))

    args.dynamic_loss_scale = False
    if args.loss_scale is None:
        args.dynamic_loss_scale = True
        if args.rank == 0:
            print(' > using dynamic loss scaling')

    # The args fp32_* or fp16_* meant to be active when the
    # args fp16 is set. So the default behaviour should all
    # be false.
    if not args.fp16:
        args.fp32_embedding = False
        args.fp32_tokentypes = False
        args.fp32_layernorm = False

    return args
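Illustrative sketch (not part of this commit): how the flags above are consumed in-process. The entry-point name and flag values are hypothetical, deepspeed must be installed because get_args() registers its config arguments, and note that model_parallel_size is clamped to WORLD_SIZE (1 here):

    import sys
    from modelscope.models.nlp.plug.arguments import get_args

    # get_args() reads sys.argv directly, so patch it for this demo.
    sys.argv = ['plug_demo', '--fp16', '--model-parallel-size', '8',
                '--seq-length', '128', '--top_k', '20', '--temperature', '0.9']
    args = get_args()
    print(args.fp16, args.top_k, args.seq_length)   # True 20 128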
368  modelscope/models/nlp/plug/configuration_plug.py  Normal file
@@ -0,0 +1,368 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import copy

""" PLUG model configuration """
from collections import OrderedDict
from typing import Mapping

from transformers import PretrainedConfig
from modelscope.utils import logger as logging

logger = logging.get_logger(__name__)


class PlugNLUConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a
    :class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments,
    defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
    to that of the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
            :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
            <https://arxiv.org/abs/2009.13658>`__.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if ``config.is_decoder=True``.
        classifier_dropout (:obj:`float`, `optional`):
            The dropout ratio for the classification head.

    Examples::

        >>> from transformers import BertModel, BertConfig

        >>> # Initializing a BERT bert-base-uncased style configuration
        >>> configuration = BertConfig()

        >>> # Initializing a model from the bert-base-uncased style configuration
        >>> model = BertModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "plugNLU"

    def __init__(
        self,
        vocab_size=21504,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.00707,
        deep_init=False,
        deepspeed=False,
        lr_decay_style='linear',
        weight_decay=1e-2,
        clip_grad=1.0,
        warmup=0.01,
        pre_ln=False,
        fp16=False,
        fp32_layernorm=False,
        fp32_embedding=False,
        fp32_tokentypes=False,
        layernorm_epsilon=1e-12,
        dec_hidden_layers=6,
        pruning_method=None,
        pruning_mask_init="constant",
        pruning_mask_scale=0.0,
        pruning_initial_threshold=1.0,
        pruning_final_threshold=0.01,
        pruning_initial_warmup=1,
        pruning_final_warmup=20,
        pruning_module='decoder',
        pruning_decay_step=50,
        pruning_decay_type='exp',
        ft_module=None,
        attn_separate=True,
        LR_weight_rank=8,
        LR_mask_rank=8,
        **kwargs
    ):
        super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.deep_init = deep_init
        self.deepspeed = deepspeed
        self.lr_decay_style = lr_decay_style
        self.weight_decay = weight_decay
        self.clip_grad = clip_grad
        self.warmup = warmup
        self.pre_ln = pre_ln
        self.fp16 = fp16
        self.fp32_layernorm = fp32_layernorm
        self.fp32_embedding = fp32_embedding
        self.layernorm_epsilon = layernorm_epsilon
        self.fp32_tokentypes = fp32_tokentypes
        self.dec_hidden_layers = dec_hidden_layers
        self.pruning_method = pruning_method
        self.pruning_mask_init = pruning_mask_init
        self.pruning_mask_scale = pruning_mask_scale
        self.pruning_module = pruning_module
        self.pruning_initial_threshold = pruning_initial_threshold
        self.pruning_final_threshold = pruning_final_threshold
        self.pruning_initial_warmup = pruning_initial_warmup
        self.pruning_final_warmup = pruning_final_warmup
        self.pruning_decay_step = pruning_decay_step
        self.pruning_decay_type = pruning_decay_type
        self.ft_module = ft_module
        self.attn_separate = attn_separate
        self.LR_weight_rank = LR_weight_rank
        self.LR_mask_rank = LR_mask_rank

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = PlugNLUConfig()
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def merge_args(self, args):
        """Merge values from an argparse namespace into this config; existing config keys win."""
        local_keys = self.__dict__.keys()
        for key, value in args.__dict__.items():
            if key in local_keys:
                continue
            self.__dict__[key] = value
        return self

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class PlugNLGConfig(PlugNLUConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a
    :class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments,
    defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
    to that of the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
            :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
            <https://arxiv.org/abs/2009.13658>`__.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if ``config.is_decoder=True``.
        classifier_dropout (:obj:`float`, `optional`):
            The dropout ratio for the classification head.

    Examples::

        >>> from transformers import BertModel, BertConfig

        >>> # Initializing a BERT bert-base-uncased style configuration
        >>> configuration = BertConfig()

        >>> # Initializing a model from the bert-base-uncased style configuration
        >>> model = BertModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "plugNLG"

    def __init__(
        self,
        vocab_size=21504,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.00707,
        deep_init=False,
        deepspeed=False,
        lr_decay_style='linear',
        weight_decay=1e-2,
        clip_grad=1.0,
        warmup=0.01,
        pre_ln=False,
        fp16=False,
        fp32_layernorm=False,
        fp32_embedding=False,
        fp32_tokentypes=False,
        layernorm_epsilon=1e-12,
        dec_hidden_layers=6,
        pruning_method=None,
        pruning_mask_init="constant",
        pruning_mask_scale=0.0,
        pruning_initial_threshold=1.0,
        pruning_final_threshold=0.01,
        pruning_initial_warmup=1,
        pruning_final_warmup=20,
        pruning_module='decoder',
        pruning_decay_step=50,
        pruning_decay_type='exp',
        ft_module=None,
        attn_separate=False,
        LR_weight_rank=8,
        LR_mask_rank=8,
        **kwargs
    ):
        super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.deep_init = deep_init
        self.deepspeed = deepspeed
        self.lr_decay_style = lr_decay_style
        self.weight_decay = weight_decay
        self.clip_grad = clip_grad
        self.warmup = warmup
        self.pre_ln = pre_ln
        self.fp16 = fp16
        self.fp32_layernorm = fp32_layernorm
        self.fp32_embedding = fp32_embedding
        self.layernorm_epsilon = layernorm_epsilon
        self.fp32_tokentypes = fp32_tokentypes
        self.dec_hidden_layers = dec_hidden_layers
        self.pruning_method = pruning_method
        self.pruning_mask_init = pruning_mask_init
        self.pruning_mask_scale = pruning_mask_scale
        self.pruning_module = pruning_module
        self.pruning_initial_threshold = pruning_initial_threshold
        self.pruning_final_threshold = pruning_final_threshold
        self.pruning_initial_warmup = pruning_initial_warmup
        self.pruning_final_warmup = pruning_final_warmup
        self.pruning_decay_step = pruning_decay_step
        self.pruning_decay_type = pruning_decay_type
        self.ft_module = ft_module
        self.attn_separate = attn_separate
        self.LR_weight_rank = LR_weight_rank
        self.LR_mask_rank = LR_mask_rank
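Illustrative sketch (not part of this commit): how the config class is meant to be combined with the argparse namespace produced by arguments.py. Values printed are the class defaults; `args` is assumed to come from get_args():

    from modelscope.models.nlp.plug.configuration_plug import PlugNLGConfig

    config = PlugNLGConfig()          # defaults defined above
    # merge_args() copies argparse attributes that the config does not define yet,
    # so command-line flags (batch_size, top_k, ...) become config fields too.
    config = config.merge_args(args)
    print(config.num_hidden_layers, config.dec_hidden_layers)   # 12 6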
198  modelscope/models/nlp/plug/distributed_plug.py  Normal file
@@ -0,0 +1,198 @@
import random
import torch
import numpy as np
import torch.nn.functional as F
from typing import Dict

from . import PlugModel
from modelscope.models.base import Tensor
from modelscope.utils.nlp import mpu
from modelscope.utils.nlp.utils import print_rank_0
from modelscope.utils.nlp.fp16 import FP16_Module
from modelscope.utils.nlp.distributed import DistributedDataParallel as DDP

import os
from modelscope.utils.torch_utils import init_dist


def initialize_distributed(rank):
    """Initialize torch.distributed."""
    # Manually set the device ids.
    # torch.multiprocessing.set_start_method("spawn")
    device = rank % torch.cuda.device_count()
    torch.cuda.set_device(device)
    # Call the init process
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', '127.0.0.1')
    master_port = os.getenv('MASTER_PORT', '12345')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend="nccl",
        world_size=8, rank=rank,
        init_method=init_method)
    # Set the model-parallel communicators.
    mpu.initialize_model_parallel(8)


def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    # This function has been mostly taken from huggingface conversational ai code at
    # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313

    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # convert to 1D
        logits = logits.view(logits.size()[1]).contiguous()
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
        # going back to 2D
        logits = logits.view(1, -1).contiguous()

    return logits


class DistributedPlug:

    @classmethod
    def init(cls, rank, model_dir, model_config, args):
        # def init(cls, rank):
        # torch.backends.cudnn.enabled = False
        #
        cls.rank = rank
        cls.args = args
        cls.config = model_config
        cls.model_dir = model_dir
        initialize_distributed(rank)
        cls.set_random_seed(cls, args.seed)
        cls.setup_model(cls, path_load_tag='model')

    def set_random_seed(cls, seed):
        if seed is not None and seed > 0:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            mpu.model_parallel_cuda_manual_seed(seed)

    def get_model(cls):
        """Build the model."""

        print_rank_0('Building Plug model. It will take a few minutes ...')
        model = PlugModel(cls.config)

        if mpu.get_data_parallel_rank() == 0:
            print(' > number of parameters on model parallel rank {}: {}'.format(
                mpu.get_model_parallel_rank(),
                sum([p.nelement() for p in model.parameters()])), flush=True)

        if cls.args.deepspeed and cls.args.fp16:
            model.half()

        # GPU allocation.
        model.cuda(torch.cuda.current_device())

        # Fp16 conversion.
        if cls.args.fp16:
            model = FP16_Module(model)
            if cls.args.fp32_embedding:
                model.module.model.bert.embeddings.word_embeddings.float()
                model.module.model.bert.embeddings.position_embeddings.float()
                model.module.model.bert.embeddings.token_type_embeddings.float()
            if cls.args.fp32_tokentypes:
                model.module.model.bert.embeddings.token_type_embeddings.float()
            if cls.args.fp32_layernorm:
                for name, _module in model.named_modules():
                    if 'LayerNorm' in name:
                        _module.float()

        # model = DDP(model)

        return model

    def setup_model(cls, path_load_tag='model'):
        dist_model = cls.get_model(cls)
        if cls.model_dir is not None:
            from modelscope.utils.nlp.load_checkpoint import pre_load
            load_model = pre_load(mpu, cls.model_dir, tag=path_load_tag)
            # model_dict = dist_model.module.module.model.state_dict()
            model_dict = dist_model.module.model.state_dict()
            for key in load_model:
                if key not in model_dict.keys():
                    print_rank_0('Skip key: ' + key)
                else:
                    print_rank_0('Loading key: ' + key)
            # dist_model.module.module.model.load_state_dict(load_model, strict=False)
            dist_model.module.model.load_state_dict(load_model, strict=False)
        cls.args.iteration = 0
        cls.dist_model = dist_model

    @classmethod
    def forward(cls, input: Dict[str, Tensor]):
        device = torch.cuda.current_device()
        tokens = input["input_ids"].to(device)
        dec_input_ids = input["dec_input_ids"].to(device)
        attention_mask = input["attention_mask"].to(device)
        cls.dist_model.eval()
        seq_length = 128
        with torch.no_grad():
            all_generate_tokens = []
            generate_tokens = []
            counter = 0
            sequence_output = None
            vocab_size = 21128
            # tokens, attention_mask, types, dec_input_ids = get_batch(context_tokens_tensor, device, args)
            while counter < seq_length:
                # if counter % 128 == 0 and counter != 0:
                #     generate_tokens.append(tokenizer.vocab[args.sep_token])
                #     start = (context_tokens_tensor == 102).nonzero(as_tuple=True)[-1]
                #     if start + len(generate_tokens) >= 512:
                #         context_tokens_tensor = torch.cat([context_tokens_tensor[:start], torch.cuda.LongTensor(generate_tokens)], -1)[-512:]
                #     else:
                #         context_tokens_tensor[start:start+len(generate_tokens)] = torch.cuda.LongTensor(generate_tokens)
                #     tokens, attention_mask, types, dec_input_ids = get_batch(context_tokens_tensor, device, args)
                #     generate_tokens = []
                #     sequence_output = None

                position_ids = torch.full([cls.args.batch_size, 1], len(generate_tokens), dtype=torch.long, device=device)
                _, logits, sequence_output = cls.dist_model(tokens, None, attention_mask, dec_input_ids, attention_mask, position_ids, is_infer=True, sequence_output=sequence_output, parallel_output=False)

                partition_vocab_size = logits.size()[-1]

                logits = logits[:, -1, :]
                logits = logits / cls.args.temperature
                logits = top_k_logits(logits, top_k=cls.args.top_k, top_p=cls.args.top_p)
                log_probs = F.softmax(logits, dim=-1)
                prev = torch.multinomial(log_probs, num_samples=1)
                prev_token = prev[0].item()
                if prev_token >= vocab_size:  # or prev_token == 102:
                    prev_token = 100
                    prev[0] = 100
                # if prev_token == 102 and len(all_generate_tokens) > int(max(1, length) * 0.8):
                if prev_token == 102:
                    break
                # if prev_token == 102:
                #     counter += 1
                #     continue
                # if prev_token == 100:
                #     counter += 1
                #     continue
                dec_input_ids = torch.cat([dec_input_ids, prev], dim=1)
                generate_tokens.append(prev_token)
                all_generate_tokens.append(prev_token)
                counter += 1

            generate_context = []
            for token in all_generate_tokens:
                if generate_context and generate_context[-1] == 100 and token == 100:
                    continue
                else:
                    generate_context.append(token)
            return {"generate_context": generate_context}
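Illustrative sketch (not part of this commit): the decode loop above applies temperature, then top-k/top-p filtering with top_k_logits(), then samples one token per step. The filtering-and-sampling step in isolation, with made-up shapes and values:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(1, 21128)                        # [batch=1, vocab]
    logits = logits / 0.9                                 # temperature
    logits = top_k_logits(logits, top_k=20, top_p=0.0)    # keep only the 20 most likely tokens
    probs = F.softmax(logits, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)  # shape [1, 1]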
1027  modelscope/models/nlp/plug/modeling_plug.py  Normal file
(File diff suppressed because it is too large)
57  modelscope/models/nlp/plug/plug_for_text_generation.py  Normal file
@@ -0,0 +1,57 @@
import torch
from typing import Dict
from functools import partial

from . import DistributedPlug
from ...base import Tensor, TorchModel
from ...builder import MODELS
from ....metainfo import Models
from ....outputs import OutputKeys
from ....utils.constant import Tasks

__all__ = ['PlugForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=Models.plug)
class PlugForTextGeneration(TorchModel):

    def __init__(self, model_dir: str, *args, **kwargs):
        super().__init__(model_dir, *args, **kwargs)
        import torch

        from transformers import BertTokenizer
        from multiprocessing import Pool
        from .arguments import get_args
        from . import PlugNLGConfig

        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        model_config = PlugNLGConfig.from_pretrained(model_dir)

        # TODO(suluyan): Arguments
        args = get_args()
        args.world_size = 8
        args.model_parallel_size = 8
        args.pre_load = True
        args.distributed_backend = 'nccl'
        args.fp16 = True
        args.fp32_layernorm = True
        args.checkpoint_activations = True
        args.batch_size = 1
        args.top_k = 20
        args.top_p = 0.0
        args.temperature = 0.9
        self.args = args

        self.world_size = args.world_size
        ranks = list(range(self.world_size))
        self.model_pool = Pool(self.world_size)
        self.model_pool.map(partial(DistributedPlug.init, model_dir=model_dir, model_config=model_config, args=args), ranks)

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        return self.model(**input)

    def generate(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        dec_input_ids = torch.full([self.args.batch_size, 1], self.tokenizer.cls_token_id, dtype=torch.long)
        input["dec_input_ids"] = dec_input_ids
        res = self.model_pool.map(DistributedPlug.forward, [input] * self.world_size)
        return res[0]
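Illustrative sketch (not part of this commit): once this model class and the pipeline changes below land, the intended end-to-end call should look roughly like the following. The model id is a placeholder, and 8 GPUs are assumed because the constructor hard-codes world_size = 8:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # 'damo/nlp_plug_text-generation' is a hypothetical model id used only for illustration.
    pipe = pipeline(Tasks.text_generation, model='damo/nlp_plug_text-generation')
    print(pipe('今天天气不错，'))   # -> {'text': '...generated continuation...'}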
@@ -153,7 +153,7 @@ class Pipeline(ABC):
         if self.device_name == 'gpu':
             device = create_device()
             if device.type == 'gpu':
-                torch.cuda.set_device(device)
+                pass  # torch.cuda.set_device(device)
             yield
         else:
             yield
@@ -8,6 +8,7 @@ from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.outputs import OutputKeys

 __all__ = ['TextGenerationPipeline']
@@ -56,6 +57,7 @@ class TextGenerationPipeline(Pipeline):
                 sequence_length=kwargs.pop('sequence_length', 128))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.tokenizer = preprocessor.tokenizer

     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
@@ -72,4 +74,6 @@ class TextGenerationPipeline(Pipeline):
         Returns:
             Dict[str, str]: the prediction results
         """
-        return inputs
+        generate_context = inputs["generate_context"]
+        generate_context = "".join(self.tokenizer.convert_ids_to_tokens(generate_context)).replace('[UNK]', '“').replace('##', '')
+        return {OutputKeys.TEXT: generate_context}
@@ -161,7 +161,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
         """

         model_type = get_model_type(model_dir)
-        if model_type in (Models.structbert, Models.gpt3, Models.palm):
+        if model_type in (Models.structbert, Models.gpt3, Models.palm, Models.plug):
             from modelscope.models.nlp.structbert import SbertTokenizer
             return SbertTokenizer.from_pretrained(model_dir, use_fast=False)
         elif model_type == Models.veco:
109  modelscope/utils/nlp/distributed.py  Executable file
@@ -0,0 +1,109 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
import torch.distributed as dist
from torch.nn.modules import Module
from torch.autograd import Variable
from modelscope.utils.nlp import mpu


class DistributedDataParallel(Module):

    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module
        self.data_parallel_group = mpu.get_data_parallel_group()
        src_rank = mpu.get_model_parallel_rank()
        for p in self.module.parameters():
            if torch.is_tensor(p):
                dist.broadcast(p, src_rank, group=self.data_parallel_group)

        def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False):
            if self.needs_reduction:
                self.needs_reduction = False
                buckets = {}
                for name, param in self.module.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = (param.data.type())
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                              " It is recommended to use the NCCL backend in this case.")
                        self.warn_on_half = False
                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    if fp32_allreduce:
                        coalesced = coalesced.float()
                    if not no_scale and not reduce_after:
                        coalesced /= dist.get_world_size(group=self.data_parallel_group)
                    dist.all_reduce(coalesced, group=self.data_parallel_group)
                    torch.cuda.synchronize()
                    if not no_scale and reduce_after:
                        coalesced /= dist.get_world_size(group=self.data_parallel_group)
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        self.hook_handles = []
        self.hooks = []
        for param in list(self.module.parameters()):

            def allreduce_hook(*unused):
                Variable._execution_engine.queue_callback(allreduce_params)

            # handle = param.register_hook(allreduce_hook)
            # self.hooks.append(allreduce_hook)
            # self.hook_handles.append(handle)
        self.allreduce_params = allreduce_params

    def forward(self, *inputs, **kwargs):
        self.needs_reduction = True
        return self.module(*inputs, **kwargs)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        # [h.remove() for h in self.hook_handles]
        sd = self.module.state_dict(destination, prefix, keep_vars)
        # for handle, hook in zip(self.hook_handles, self.hooks):
        #     d = handle.hooks_dict_ref()
        #     d[handle.id] = hook

        return sd

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)

    '''
    def _sync_buffers(self):
        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)

    def train(self, mode=True):
        # Clear NCCL communicator and CUDA event cache of the default group ID,
        # These cache will be recreated at the later call. This is currently a
        # work-around for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()
        super(DistributedDataParallel, self).train(mode)
        self.module.train(mode)
    '''
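Illustrative sketch (not part of this commit): because the backward hooks are commented out, this wrapper only averages gradients across the data-parallel group when allreduce_params() is called explicitly. A hedged outline of a training step, where plug_model, compute_loss, batch_inputs and optimizer are placeholders:

    model = DistributedDataParallel(plug_model)   # plug_model: any torch.nn.Module
    loss = compute_loss(model(batch_inputs))      # forward() sets needs_reduction = True
    loss.backward()
    model.allreduce_params()                      # average grads over the data-parallel group
    optimizer.step()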
30
modelscope/utils/nlp/fp16/__init__.py
Executable file
@@ -0,0 +1,30 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fp16util import (
    BN_convert_float,
    network_to_half,
    prep_param_lists,
    model_grads_to_master_grads,
    master_params_to_model_params,
    tofp16,
    to_python_float,
    clip_grad_norm,
    convert_module,
    convert_network,
    FP16Model,
)

from .fp16 import *
from .loss_scaler import *
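Taken together, the package is meant to be used roughly as follows. This is only a sketch: the import path mirrors where the files are added in this commit, a CUDA device is assumed, and `model`, `criterion` and `loader` are placeholders.

# Illustrative sketch of wiring FP16_Module / FP16_Optimizer together.
import torch
from modelscope.utils.nlp.fp16 import FP16_Module, FP16_Optimizer

model = FP16_Module(model.cuda())                      # params and forward run in fp16
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

for inputs, target in loader:
    optimizer.zero_grad()
    loss = criterion(model(inputs), target)
    optimizer.backward(loss)                           # replaces loss.backward()
    optimizer.clip_master_grads(1.0)                   # optional fp32 gradient clipping
    optimizer.step()                                   # skipped internally on overflow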
629
modelscope/utils/nlp/fp16/fp16.py
Executable file
@@ -0,0 +1,629 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Stable version of apex FP16 Optimizer"""
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from .loss_scaler import DynamicLossScaler, LossScaler
from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm

FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)


def conversion_helper(val, conversion):
    """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure."""
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    rtn = [conversion_helper(v, conversion) for v in val]
    if isinstance(val, tuple):
        rtn = tuple(rtn)
    return rtn


def fp32_to_fp16(val):
    """Convert fp32 `val` to fp16"""
    def half_conversion(val):
        val_typecheck = val
        if isinstance(val_typecheck, (Parameter, Variable)):
            val_typecheck = val.data
        if isinstance(val_typecheck, FLOAT_TYPES):
            val = val.half()
        return val
    return conversion_helper(val, half_conversion)


def fp16_to_fp32(val):
    """Convert fp16 `val` to fp32"""
    def float_conversion(val):
        val_typecheck = val
        if isinstance(val_typecheck, (Parameter, Variable)):
            val_typecheck = val.data
        if isinstance(val_typecheck, HALF_TYPES):
            val = val.float()
        return val
    return conversion_helper(val, float_conversion)


class FP16_Module(nn.Module):
    def __init__(self, module):
        super(FP16_Module, self).__init__()
        self.add_module('module', module.half())

    def forward(self, *inputs, **kwargs):
        return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs))

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        return self.module.state_dict(destination, prefix, keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)

# TODO: Update overflow check + downscale to use Carl's fused kernel.
class FP16_Optimizer(object):
    """
    :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer,
    and manage static or dynamic loss scaling and master weights in a manner transparent to the user.
    For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance,
    and changing the call to ``backward``.

    Example::

        model = torch.nn.Linear(D_in, D_out).cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        # Name the FP16_Optimizer instance to replace the existing optimizer
        # (recommended but not required):
        optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
        ...
        # loss.backward() becomes:
        optimizer.backward(loss)
        ...

    Example with dynamic loss scaling::

        ...
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        # optional arg to control dynamic loss scaling behavior
        # dynamic_loss_args={'scale_window' : 500})
        # Usually, dynamic_loss_args is not necessary.

    Args:
        init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`.
        static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.
        dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option.
        dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used.
        verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling.

    ``init_optimizer`` is expected to have been constructed in the ordinary way.
    It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be
    named to replace ``init_optimizer``, for two reasons:
    First, it means that references to the same name
    later in the file will not have to change.
    Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to
    modify ``init_optimizer``. If you do choose a unique name for the new
    :class:`FP16_Optimizer` instance, you should only work with this new instance,
    because the preexisting optimizer might no longer behave as expected.

    ``init_optimizer`` may be any Pytorch optimizer.
    It may contain a mixture of fp16 and fp32 parameters organized into any number of
    ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will
    ingest these ``param_groups`` and remember them.

    Calls to ::

        loss.backward()

    must be replaced with ::

        optimizer.backward(loss)

    because :class:`FP16_Optimizer` requires ownership of the backward pass to implement
    loss scaling and copies to master gradients.

    .. note::
        Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients
        are downscaled before being applied. This means that adjusting the loss scale, or using
        dynamic loss scaling, should not require retuning the learning rate or any other
        hyperparameters.


    **Advanced options**

    **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure.
    See docstring for :attr:`step`.

    **Gradient clipping**: Use :attr:`clip_master_grads`.

    **Multiple losses**: If your model accumulates gradients from multiple losses,
    this can be made more efficient by supplying ``update_master_grads=False``
    to :attr:`backward`. See docstring for :attr:`backward`.

    **Manually adjusting loss scale**: The current loss scale can be retrieved or set via ::

        print(optimizer.loss_scale)
        optimizer.loss_scale = new_loss_scale

    For static loss scaling, manually adjusting the loss scale over time is a reasonable
    thing to do. During later epochs, gradients may become smaller, and a
    higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss
    scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting
    the loss scale is not recommended.

    **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
    Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
    should still work as intended.
    """

    def __init__(self,
                 init_optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=False):
        if not torch.cuda.is_available():
            raise SystemError("Cannot use fp16 without CUDA.")

        self.verbose = verbose

        self.optimizer = init_optimizer
        # init_state_dict sets up an alternative way to cast per-param state tensors.
        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
        # init_state_dict = init_optimizer.state_dict()

        self.fp16_groups = []
        self.fp32_from_fp16_groups = []
        self.fp32_from_fp32_groups = []
        for i, param_group in enumerate(self.optimizer.param_groups):
            self.maybe_print("FP16_Optimizer processing param group {}:".format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            fp32_from_fp16_params_this_group = []
            for i, param in enumerate(param_group['params']):
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                                         .format(param.size()))
                        fp16_params_this_group.append(param)
                        master_param = param.detach().clone().float()
                        master_param.requires_grad = True
                        # Copy the model parallel flag.
                        master_param.model_parallel = param.model_parallel
                        param_group['params'][i] = master_param
                        fp32_from_fp16_params_this_group.append(master_param)
                        # Reset existing state dict key to the new master param.
                        # We still need to recast per-param state tensors, if any, to FP32.
                        if param in self.optimizer.state:
                            self.optimizer.state[master_param] = self.optimizer.state.pop(param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                                         .format(param.size()))
                        fp32_params_this_group.append(param)
                        param_group['params'][i] = param
                    else:
                        raise TypeError("Wrapped parameters must be either "
                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                        "Received {}".format(param.type()))

            self.fp16_groups.append(fp16_params_this_group)
            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)

        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
        self.optimizer.load_state_dict(self.optimizer.state_dict())
        # alternative way to cast per-param state tensors:
        # self.optimizer.load_state_dict(init_state_dict)

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            if dynamic_loss_args is not None:
                self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
            else:
                self.loss_scaler = DynamicLossScaler()
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True

        self.clip_grad_norm = clip_grad_norm

    def maybe_print(self, msg):
        if self.verbose:
            print(msg)

    def __getstate__(self):
        raise RuntimeError("FP16_Optimizer should be serialized using state_dict().")

    def __setstate__(self, state):
        raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().")

    def zero_grad(self, set_grads_to_None=False):
        """
        Zero fp32 and fp16 parameter grads.
        """
        # In principle, only the .grad attributes of the model params need to be zeroed,
        # because gradients are copied into the FP32 master params. However, we zero
        # all gradients owned by the optimizer, just to be safe:
        for group in self.optimizer.param_groups:
            for p in group['params']:
                if set_grads_to_None:
                    p.grad = None
                else:
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()

        # Zero fp16 gradients owned by the model:
        for fp16_group in self.fp16_groups:
            for param in fp16_group:
                if set_grads_to_None:
                    param.grad = None
                else:
                    if param.grad is not None:
                        param.grad.detach_()  # as in torch.optim.optimizer.zero_grad()
                        param.grad.zero_()

    def _check_overflow(self):
        params = []
        for group in self.fp16_groups:
            for param in group:
                params.append(param)
        for group in self.fp32_from_fp32_groups:
            for param in group:
                params.append(param)
        self.overflow = self.loss_scaler.has_overflow(params)

    def _update_scale(self, has_overflow=False):
        self.loss_scaler.update_scale(has_overflow)

    def _master_params_to_model_params(self):
        for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
            master_params_to_model_params(fp16_group, fp32_from_fp16_group)

    def _model_params_to_master_params(self):
        for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
            master_params_to_model_params(fp32_from_fp16_group, fp16_group)

    # To consider: Integrate distributed with this wrapper by registering a hook on each variable
    # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
    def _model_grads_to_master_grads(self):
        for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
            model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)

    def _downscale_master(self):
        if self.loss_scale != 1.0:
            for group in self.optimizer.param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param.grad.data.mul_(1. / self.loss_scale)

    def clip_master_grads(self, max_norm, norm_type=2):
        """
        Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``.

        Args:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
                infinity norm.

        Returns:
            Total norm of the current fp32 gradients (viewed as a single vector).

        .. warning::
            Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``).
        """
        if not self.overflow:
            fp32_params = []
            for param_group in self.optimizer.param_groups:
                for param in param_group['params']:
                    fp32_params.append(param)
            return self.clip_grad_norm(fp32_params, max_norm, norm_type)
        else:
            return -1

    def state_dict(self):
        """
        Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
        This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
        of the contained Pytorch optimizer.
        Example::

            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            torch.save(checkpoint, "saved.pth")
        """
        state_dict = {}
        state_dict['loss_scaler'] = self.loss_scaler
        state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
        state_dict['overflow'] = self.overflow
        state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step
        state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
        state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups
        return state_dict

    def load_state_dict(self, state_dict):
        """
        Loads a state_dict created by an earlier call to state_dict().
        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
        whose parameters in turn came from ``model``, it is expected that the user
        will call ``model.load_state_dict()`` before
        ``fp16_optimizer_instance.load_state_dict()`` is called.

        Example::

            model = torch.nn.Linear(D_in, D_out).cuda().half()
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
            ...
            checkpoint = torch.load("saved.pth")
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        """
        # I think it should actually be ok to reload the optimizer before the model.
        self.loss_scaler = state_dict['loss_scaler']
        self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
        self.overflow = state_dict['overflow']
        self.first_closure_call_this_step = state_dict['first_closure_call_this_step']
        self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        # At this point, the optimizer's references to the model's fp32 parameters are up to date.
        # The optimizer's hyperparameters and internal buffers are also up to date.
        # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
        # out of date. There are two options.
        # 1: Refresh the master params from the model's fp16 params.
        #    This requires less storage but incurs precision loss.
        # 2: Save and restore the fp32 master copies separately.
        #    We choose option 2.
        #
        # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
        # of their associated parameters, because it's possible those buffers might not exist yet in
        # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
        # constructed in the same way as the one whose state_dict we are loading, the same master params
        # are guaranteed to exist, so we can just copy_() from the saved master params.
        for current_group, saved_group in zip(self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']):
            for current, saved in zip(current_group, saved_group):
                current.data.copy_(saved.data)

    def step(self, closure=None):  # could add clip option.
        """
        If no closure is supplied, :attr:`step` should be called after
        ``fp16_optimizer_obj.backward(loss)``.
        :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to
        :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params
        originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run
        another forward pass using their model.

        If a closure is supplied, :attr:`step` may be called without a prior call to
        :attr:`backward(loss)`.
        This control flow is identical to `ordinary Pytorch optimizer use`_ with closures.
        However, the user should take care that any ``loss.backward()`` call within the closure
        has been replaced by ``fp16_optimizer_obj.backward(loss)``.

        Args:
            closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss.

        Example with closure::

            # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an
            # existing pytorch optimizer.
            for input, target in dataset:
                def closure():
                    optimizer.zero_grad()
                    output = model(input)
                    loss = loss_fn(output, target)
                    # loss.backward() becomes:
                    optimizer.backward(loss)
                    return loss
                optimizer.step(closure)

        .. warning::
            Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling.

        .. _`ordinary Pytorch optimizer use`:
            http://pytorch.org/docs/master/optim.html#optimizer-step-closure
        """

        scale = self.loss_scaler.loss_scale
        self._update_scale(self.overflow)

        if self.overflow:
            self.maybe_print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}"
                             .format(scale, self.loss_scale))
            return

        if closure is not None:
            retval = self._step_with_closure(closure)
        else:
            retval = self.optimizer.step()

        self._master_params_to_model_params()

        return retval

    def _step_with_closure(self, closure):
        def wrapped_closure():
            # helpful for debugging
            # print("Calling wrapped_closure, first_closure_call_this_step = {}"
            #       .format(self.first_closure_call_this_step))
            if self.first_closure_call_this_step:
                # We expect that the fp16 params are initially fresh on entering self.step(),
                # so _master_params_to_model_params() is unnecessary the first time wrapped_closure()
                # is called within self.optimizer.step().
                self.first_closure_call_this_step = False
            else:
                # If self.optimizer.step() internally calls wrapped_closure more than once,
                # it may update the fp32 params after each call. However, self.optimizer
                # doesn't know about the fp16 params at all. If the fp32 params get updated,
                # we can't rely on self.optimizer to refresh the fp16 params. We need
                # to handle that manually:
                self._master_params_to_model_params()
            # Our API expects the user to give us ownership of the backward() call by
            # replacing all calls to loss.backward() with optimizer.backward(loss).
            # This requirement holds whether or not the call to backward() is made within a closure.
            # If the user is properly calling optimizer.backward(loss) within "closure,"
            # calling closure() here will give the fp32 master params fresh gradients
            # for the optimizer to play with, so all wrapped_closure needs to do is call
            # closure() and return the loss.
            temp_loss = closure()
            while (self.overflow):
                scale = self.loss_scaler.loss_scale
                self._update_scale(self.overflow)
                self.maybe_print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, "
                                 "reducing to {}".format(scale, self.loss_scale))
                temp_loss = closure()
            return temp_loss

        retval = self.optimizer.step(wrapped_closure)

        self.first_closure_call_this_step = True

        return retval

    def backward(self, loss, update_master_grads=True, retain_graph=False):
        """
        :attr:`backward` performs the following conceptual steps:

        1. fp32_loss = loss.float() (see first Note below)
        2. scaled_loss = fp32_loss*loss_scale
        3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined).
        4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32.
        5. Finally, master grads are divided by loss_scale.

        In this way, after :attr:`backward`, the master params have fresh gradients,
        and :attr:`step` may be called.

        .. note::
            :attr:`backward` internally converts the loss to fp32 before applying the loss scale.
            This provides some additional safety against overflow if the user has supplied an
            fp16 loss value.
            However, for maximum overflow safety, the user should
            compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to
            :attr:`backward`.

        .. warning::
            The gradients found in a model's leaves after the call to
            :attr:`backward` should not be regarded as valid in general,
            because it's possible
            they have been scaled (and in the case of dynamic loss scaling,
            the scale factor may change over time).
            If the user wants to inspect gradients after a call to :attr:`backward`,
            only the master gradients should be regarded as valid. These can be retrieved via
            :attr:`inspect_master_grad_data()`.

        Args:
            loss: The loss output by the user's model. loss may be either float or half (but see first Note above).
            update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`.
            retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below).

        Example::

            # Ordinary operation:
            optimizer.backward(loss)

            # Naive operation with multiple losses (technically valid, but less efficient):
            # fp32 grads will be correct after the second call, but
            # the first call incurs an unnecessary fp16->fp32 grad copy.
            optimizer.backward(loss1)
            optimizer.backward(loss2)

            # More efficient way to handle multiple losses:
            # The fp16->fp32 grad copy is delayed until fp16 grads from all
            # losses have been accumulated.
            optimizer.backward(loss1, update_master_grads=False)
            optimizer.backward(loss2, update_master_grads=False)
            optimizer.update_master_grads()
        """
        # To consider: try multiple backward passes using retain_grad=True to find
        # a loss scale that works. After you find a loss scale that works, do a final dummy
        # backward pass with retain_graph=False to tear down the graph. Doing this would avoid
        # discarding the iteration, but probably wouldn't improve overall efficiency.
        self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
        if update_master_grads:
            self.update_master_grads()

    def update_master_grads(self):
        """
        Copy the ``.grad`` attribute from stored references to fp16 parameters to
        the ``.grad`` attribute of the fp32 master parameters that are directly
        updated by the optimizer. :attr:`update_master_grads` only needs to be called if
        ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
        """
        if self.dynamic_loss_scale:
            self._check_overflow()
            if self.overflow: return
        self._model_grads_to_master_grads()
        self._downscale_master()

    def inspect_master_grad_data(self):
        """
        When running with :class:`FP16_Optimizer`,
        ``.grad`` attributes of a model's fp16 leaves should not be
        regarded as truthful, because they might be scaled.
        After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
        the fp32 master params' ``.grad``
        attributes will contain valid gradients properly divided by the loss scale. However,
        because :class:`FP16_Optimizer` flattens some parameters, accessing them may be
        nonintuitive. :attr:`inspect_master_grad_data`
        allows those gradients to be viewed with shapes corresponding to their associated model leaves.

        Returns:
            List of lists (one list for each parameter group). The list for each parameter group
            is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group.
        """
        if self.overflow:
            print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. "
                  "Gradients are currently invalid (may be inf, nan, or stale). Returning None.")
            return None
        else:
            # The optimizer owns only references to master params.
            master_grads_data = []
            for param_group in self.optimizer.param_groups:
                master_grads_this_group = []
                for param in param_group['params']:
                    if param.grad is not None:
                        master_grads_this_group.append(param.grad.data)
                    else:
                        master_grads_this_group.append(None)
                master_grads_data.append(master_grads_this_group)
            return master_grads_data


    # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
    def _get_loss_scale(self):
        return self.loss_scaler.loss_scale

    def _set_loss_scale(self, value):
        self.loss_scaler.cur_scale = value

    loss_scale = property(_get_loss_scale, _set_loss_scale)

    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
    def _get_state(self):
        return self.optimizer.state

    def _set_state(self, value):
        self.optimizer.state = value

    state = property(_get_state, _set_state)

    # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
    # (for example, to adjust the learning rate)
    def _get_param_groups(self):
        return self.optimizer.param_groups

    def _set_param_groups(self, value):
        self.optimizer.param_groups = value

    param_groups = property(_get_param_groups, _set_param_groups)
204
modelscope/utils/nlp/fp16/fp16util.py
Executable file
@@ -0,0 +1,204 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from sofa.utils import mpu


class tofp16(nn.Module):
    """
    Utility module that implements::

        def forward(self, input):
            return input.half()
    """

    def __init__(self):
        super(tofp16, self).__init__()

    def forward(self, input):
        return input.half()


def BN_convert_float(module):
    """
    Utility function for network_to_half().

    Retained for legacy purposes.
    """
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
        module.float()
    for child in module.children():
        BN_convert_float(child)
    return module


def network_to_half(network):
    """
    Convert model to half precision in a batchnorm-safe way.

    Retained for legacy purposes. It is recommended to use FP16Model.
    """
    return nn.Sequential(tofp16(), BN_convert_float(network.half()))


def convert_module(module, dtype):
    """
    Converts a module's immediate parameters and buffers to dtype.
    """
    for param in module.parameters(recurse=False):
        if param is not None:
            if param.data.dtype.is_floating_point:
                param.data = param.data.to(dtype=dtype)
            if param._grad is not None and param._grad.data.dtype.is_floating_point:
                param._grad.data = param._grad.data.to(dtype=dtype)

    for buf in module.buffers(recurse=False):
        if buf is not None and buf.data.dtype.is_floating_point:
            buf.data = buf.data.to(dtype=dtype)


def convert_network(network, dtype):
    """
    Converts a network's parameters and buffers to dtype.
    """
    for module in network.modules():
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
            continue
        convert_module(module, dtype)
    return network


class FP16Model(nn.Module):
    """
    Convert model to half precision in a batchnorm-safe way.
    """

    def __init__(self, network):
        super(FP16Model, self).__init__()
        self.network = convert_network(network, dtype=torch.half)

    def forward(self, *inputs):
        inputs = tuple(t.half() for t in inputs)
        return self.network(*inputs)


def backwards_debug_hook(grad):
    raise RuntimeError("master_params received a gradient in the backward pass!")

def prep_param_lists(model, flat_master=False):
    """
    Creates a list of FP32 master parameters for a given model, as in
    `Training Neural Networks with Mixed Precision: Real Examples`_.

    Args:
        model (torch.nn.Module): Existing Pytorch model
        flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization.
    Returns:
        A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element.

    Example::

        model_params, master_params = prep_param_lists(model)

    .. warning::
        Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.

    .. _`Training Neural Networks with Mixed Precision: Real Examples`:
        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
    """
    model_params = [param for param in model.parameters() if param.requires_grad]

    if flat_master:
        # Give the user some more useful error messages
        try:
            # flatten_dense_tensors returns a contiguous flat array.
            # http://pytorch.org/docs/master/_modules/torch/_utils.html
            master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
        except:
            print("Error in prep_param_lists: model may contain a mixture of parameters "
                  "of different types. Use flat_master=False, or use F16_Optimizer.")
            raise
        master_params = torch.nn.Parameter(master_params)
        master_params.requires_grad = True
        # master_params.register_hook(backwards_debug_hook)
        if master_params.grad is None:
            master_params.grad = master_params.new(*master_params.size())
        return model_params, [master_params]
    else:
        master_params = [param.clone().float().detach() for param in model_params]
        for param in master_params:
            param.requires_grad = True
        return model_params, master_params


def model_grads_to_master_grads(model_params, master_params, flat_master=False):
    """
    Copy model gradients to master gradients.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
    """
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        master_params[0].grad.data.copy_(
            _flatten_dense_tensors([p.grad.data for p in model_params]))
    else:
        for model, master in zip(model_params, master_params):
            if model.grad is not None:
                if master.grad is None:
                    master.grad = Variable(master.data.new(*master.data.size()))
                master.grad.data.copy_(model.grad.data)
            else:
                master.grad = None


def master_params_to_model_params(model_params, master_params, flat_master=False):
    """
    Copy master parameters to model parameters.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
    """
    if flat_master:
        for model, master in zip(model_params,
                                 _unflatten_dense_tensors(master_params[0].data, model_params)):
            model.data.copy_(master)
    else:
        for model, master in zip(model_params, master_params):
            model.data.copy_(master.data)

# Backward compatibility fixes

def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]

TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

clip_grad_norm = mpu.clip_grad_norm
# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
#     clip_grad_norm = torch.nn.utils.clip_grad_norm
# else:
#     clip_grad_norm = torch.nn.utils.clip_grad_norm_
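These helpers can also be combined without :class:`FP16_Optimizer`. A rough sketch of the manual master-weights recipe they implement (names, the loss-scale constant, and `criterion`/`inputs`/`target` are illustrative; `model` is assumed to already be `.half().cuda()`):

# Hedged sketch: one manual fp16 training step built from the helpers above.
model_params, master_params = prep_param_lists(model)
optimizer = torch.optim.SGD(master_params, lr=0.01)      # optimizer owns fp32 masters
loss_scale = 128.0

loss = criterion(model(inputs), target)
model.zero_grad()
(loss.float() * loss_scale).backward()                   # scaled backward in fp16
model_grads_to_master_grads(model_params, master_params)
for p in master_params:                                   # unscale in fp32
    if p.grad is not None:
        p.grad.data.mul_(1.0 / loss_scale)
optimizer.step()
master_params_to_model_params(model_params, master_params)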
237
modelscope/utils/nlp/fp16/loss_scaler.py
Executable file
@@ -0,0 +1,237 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from sofa.utils import mpu

# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]

class LossScaler:
    """
    Class that manages a static loss scale. This class is intended to interact with
    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.

    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
    :class:`FP16_Optimizer`'s constructor.

    Args:
        scale (float, optional, default=1.0): The loss scale.
    """

    def __init__(self, scale=1):
        self.cur_scale = scale

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        return False

    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
        return False

    def update_scale(self, overflow):
        pass

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss, retain_graph=False):
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)

class DynamicLossScaler:
    """
    Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
    :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
    operates, because the default options can be changed using
    the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.

    Loss scaling is designed to combat the problem of underflowing gradients encountered at long
    times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss
    scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
    occurred.
    :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
    If a certain number of iterations occur without overflowing gradients detected,
    :class:`DynamicLossScaler` increases the loss scale once more.
    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
    always using the highest loss scale possible without incurring overflow.

    Args:
        init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
        scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
        scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
    """

    def __init__(self,
                 init_scale=2**32,
                 scale_factor=2.,
                 scale_window=1000,
                 min_scale=1,
                 delayed_shift=1,
                 consecutive_hysteresis=False):
        self.cur_scale = init_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.min_scale = min_scale
        self.delayed_shift = delayed_shift
        self.cur_hysteresis = delayed_shift
        self.consecutive_hysteresis = consecutive_hysteresis

    # `params` is a list / generator of torch.Variable
    def has_overflow_serial(self, params):
        for p in params:
            if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
                return True

        return False

    def has_overflow(self, params):
        overflow = self.has_overflow_serial(params)
        # Since each model parallel GPU carries only part of the model,
        # make sure overflow flag is synced across all the model parallel GPUs
        overflow_gpu = torch.cuda.ByteTensor([overflow])
        torch.distributed.all_reduce(overflow_gpu,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=mpu.get_model_parallel_group())
        overflow = overflow_gpu[0].item()
        return bool(overflow)

    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
        try:
            # if x is half, the .float() incurs an additional deep copy, but it's necessary if
            # Pytorch's .sum() creates a one-element tensor of the same type as x
            # (which is true for some recent version of pytorch).
            cpu_sum = float(x.float().sum())
            # More efficient version that can be used if .sum() returns a Python scalar
            # cpu_sum = float(x.sum())
        except RuntimeError as instance:
            # We want to check if inst is actually an overflow exception.
            # RuntimeError could come from a different error.
            # If so, we still want the exception to propagate.
            if "value cannot be converted" not in instance.args[0]:
                raise
            return True
        else:
            if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
                return True
            return False

    # `overflow` is boolean indicating whether the gradient overflowed
    def update_scale(self, overflow):

        if not hasattr(self, 'min_scale'):
            self.min_scale = 1
        if not hasattr(self, 'delayed_shift'):
            self.delayed_shift = 1
        if not hasattr(self, 'cur_hysteresis'):
            self.cur_hysteresis = 1
        if not hasattr(self, 'consecutive_hysteresis'):
            self.consecutive_hysteresis = True
        if overflow:
            # self.cur_scale /= self.scale_factor
            if self.delayed_shift == 1 or self.cur_hysteresis == 1:
                self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
            else:
                self.cur_hysteresis -= 1
            self.last_overflow_iter = self.cur_iter
        else:
            if self.consecutive_hysteresis:
                self.cur_hysteresis = self.delayed_shift
            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
                if not self.consecutive_hysteresis:
                    self.cur_hysteresis = self.delayed_shift
                self.cur_scale *= self.scale_factor
        self.cur_iter += 1

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss, retain_graph=False):
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward(retain_graph=retain_graph)

##############################################################
# Example usage below here -- assuming it's in a separate file
##############################################################
"""
TO-DO separate out into an example.
if __name__ == "__main__":
    import torch
    from torch.autograd import Variable
    from dynamic_loss_scaler import DynamicLossScaler

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
    x = Variable(torch.randn(N, D_in), requires_grad=False)
    y = Variable(torch.randn(N, D_out), requires_grad=False)

    w1 = Variable(torch.randn(D_in, H), requires_grad=True)
    w2 = Variable(torch.randn(H, D_out), requires_grad=True)
    parameters = [w1, w2]

    learning_rate = 1e-6
    optimizer = torch.optim.SGD(parameters, lr=learning_rate)
    loss_scaler = DynamicLossScaler()

    for t in range(500):
        y_pred = x.mm(w1).clamp(min=0).mm(w2)
        loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
        print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
        print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
        print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))

        # Run backprop
        optimizer.zero_grad()
        loss.backward()

        # Check for overflow
        has_overflow = DynamicLossScaler.has_overflow(parameters)

        # If no overflow, unscale grad and update as usual
        if not has_overflow:
            for param in parameters:
                param.grad.data.mul_(1. / loss_scaler.loss_scale)
            optimizer.step()
        # Otherwise, don't do anything -- ie, skip iteration
        else:
            print('OVERFLOW!')

        # Update loss scale for next iteration
        loss_scaler.update_scale(has_overflow)

"""
102
modelscope/utils/nlp/load_checkpoint.py
Executable file
@@ -0,0 +1,102 @@
import os
import torch

def load_checkpoint(model,
                    load_dir,
                    tag,
                    load_module_strict=True,
                    load_optimizer_states=True,
                    load_lr_scheduler_states=True):
    r"""Load training checkpoint

    Arguments:
        load_dir: Required. Directory to load the checkpoint from
        tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step.
        load_module_strict: Optional. Boolean to strictly enforce that the keys in state_dict of module and checkpoint match.
        load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint. Ex. ADAM's momentum and variance
        load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint.
    Return:
        load_path: Path of the loaded checkpoint. None if loading the checkpoint failed
        client_state: State dictionary used for loading required training states in the client code.
    """

    load_path, client_states = _load_checkpoint(model,
                                                load_dir,
                                                tag,
                                                load_module_strict=load_module_strict,
                                                load_optimizer_states=load_optimizer_states,
                                                load_lr_scheduler_states=load_lr_scheduler_states)

    if load_optimizer_states:
        if model.zero_optimization() and load_path is not None:
            model._load_zero_checkpoint(load_dir,
                                        tag,
                                        load_optimizer_states=load_optimizer_states)

    return load_path, client_states


def _get_ckpt_name(mpu, checkpoints_path, tag):
    mp_rank = 0 if mpu is None else mpu.get_model_parallel_rank()
    ckpt_name = os.path.join(checkpoints_path,
                             str(tag),
                             'mp_rank_{:02d}'.format(mp_rank) + '_model_states.pt')
    return ckpt_name


def pre_load(mpu,
             load_dir,
             tag=''):
    load_path = _get_ckpt_name(mpu, load_dir, tag)
    checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage)
    return checkpoint['module']


def _load_checkpoint(model,
                     load_dir,
                     tag,
                     load_module_strict=True,
                     load_optimizer_states=True,
                     load_lr_scheduler_states=True):

    load_path = model._get_ckpt_name(load_dir, tag)

    if not os.path.exists(load_path):
        return None, None

    checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage)

    model.load_module_state_dict(state_dict=checkpoint['module'],
                                 strict=load_module_strict)
    if not model.zero_optimization() and load_optimizer_states:
        if model.fp16_enabled():
            model.optimizer.load_state_dict(
                checkpoint['optimizer'],
                load_optimizer_states=load_optimizer_states)
        elif load_optimizer_states:
            model.optimizer.load_state_dict(checkpoint['optimizer'])

    if load_lr_scheduler_states and model.lr_scheduler is not None:
        model.lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    model.csr_tensor_module_names = checkpoint['csr_tensor_module_names']
    model.global_steps = checkpoint['global_steps']
    model.global_samples = checkpoint.get('global_samples',
                                          model.global_steps * model.train_batch_size())
    model.skipped_steps = checkpoint['skipped_steps']
    model.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size']
    model.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size']
    deepspeed_states = [
        'module',
        'optimizer',
        'lr_scheduler',
        'csr_tensor_module_names',
        'skipped_steps',
        'global_steps',
        'dp_world_size',
        'mp_world_size'
    ]
    client_state = {
        key: value
        for key, value in checkpoint.items() if key not in deepspeed_states
    }

    return load_path, client_state
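For orientation, `pre_load` expects a DeepSpeed-style layout of one state file per model-parallel rank under `<load_dir>/<tag>/mp_rank_XX_model_states.pt`. A minimal call might look like the sketch below (paths, the tag value, and the import of the nlp-level mpu package are illustrative; `pre_load` only needs `get_model_parallel_rank()` from the object it is given, and `model` is the bare module):

# Hypothetical example of restoring module weights on the current model-parallel rank.
from modelscope.utils.nlp import mpu
from modelscope.utils.nlp.load_checkpoint import pre_load

state_dict = pre_load(mpu, '/path/to/checkpoints', tag='49000')
model.load_state_dict(state_dict)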
55
modelscope/utils/nlp/mpu/__init__.py
Executable file
@@ -0,0 +1,55 @@
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Model parallel utility interface."""

from .cross_entropy import vocab_parallel_cross_entropy

from .data import broadcast_data

from .grads import clip_grad_norm

from .initialize import destroy_model_parallel
from .initialize import get_data_parallel_group
from .initialize import get_data_parallel_rank
from .initialize import get_data_parallel_world_size
from .initialize import get_model_parallel_group
from .initialize import get_model_parallel_rank
from .initialize import get_model_parallel_src_rank
from .initialize import get_model_parallel_world_size
from .initialize import initialize_model_parallel
from .initialize import model_parallel_is_initialized

from .layers import ColumnParallelLinear
from .layers import ParallelEmbedding
from .layers import RowParallelLinear
from .layers import VocabParallelEmbedding

from .mappings import copy_to_model_parallel_region
from .mappings import gather_from_model_parallel_region
from .mappings import reduce_from_model_parallel_region
from .mappings import scatter_to_model_parallel_region
from .mappings import _gather

from .random import checkpoint
from .random import partition_activations_in_checkpoint
from .random import get_cuda_rng_tracker
from .random import model_parallel_cuda_manual_seed

from .transformer import BertParallelSelfAttention
from .transformer import BertParallelTransformerLayer
from .transformer import GPT2ParallelTransformer, GPT2ParallelSelfAttention
from .transformer import PalmParallelCrossAttention
from .transformer import LayerNorm
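For orientation, a typical entry point into this interface looks roughly like the sketch below. It assumes torch.distributed has already been initialized with one process per GPU; only names exported by this __init__ are used.

import torch
from modelscope.utils.nlp import mpu

# Assumes the process group exists already, e.g.
# torch.distributed.init_process_group(backend='nccl', init_method='env://')
mpu.initialize_model_parallel(2)            # 2-way model parallelism
mp_rank = mpu.get_model_parallel_rank()     # rank inside the model parallel group
dp_rank = mpu.get_data_parallel_rank()      # rank inside the data parallel group
mpu.model_parallel_cuda_manual_seed(1234)   # per-rank dropout seeds for parallel regions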
209
modelscope/utils/nlp/mpu/binarizer.py
Normal file
@@ -0,0 +1,209 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020-present, AllenAI Authors, University of Illinois Urbana-Champaign,
|
||||
# Intel Nervana Systems and the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Binarizers take a (real value) matrix as input and produce a binary (values in {0,1}) mask of the same shape.
|
||||
"""
|
||||
|
||||
import torch
|
||||
from torch import autograd
|
||||
|
||||
|
||||
class ThresholdBinarizer(autograd.Function):
|
||||
"""
|
||||
Threshold binarizer.
|
||||
Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j} > \tau`
|
||||
where `\tau` is a real value threshold.
|
||||
|
||||
Implementation is inspired from:
|
||||
https://github.com/arunmallya/piggyback
|
||||
Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights
|
||||
Arun Mallya, Dillon Davis, Svetlana Lazebnik
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, inputs: torch.tensor, threshold: float, sigmoid: bool):
|
||||
"""
|
||||
Args:
|
||||
inputs (`torch.FloatTensor`)
|
||||
The input matrix from which the binarizer computes the binary mask.
|
||||
threshold (`float`)
|
||||
The threshold value (in R).
|
||||
sigmoid (`bool`)
|
||||
If set to ``True``, we apply the sigmoid function to the `inputs` matrix before comparing to `threshold`.
|
||||
In this case, `threshold` should be a value between 0 and 1.
|
||||
Returns:
|
||||
mask (`torch.FloatTensor`)
|
||||
Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
|
||||
retained, 0 - the associated weight is pruned).
|
||||
"""
|
||||
nb_elems = inputs.numel()
|
||||
nb_min = int(0.005 * nb_elems) + 1
|
||||
if sigmoid:
|
||||
mask = (torch.sigmoid(inputs) > threshold).type(inputs.type()).bool()
|
||||
else:
|
||||
mask = (inputs > threshold).type(inputs.type()).bool()
|
||||
if mask.sum() < nb_min:
|
||||
# We limit the pruning so that at least 0.5% (half a percent) of the weights are remaining
|
||||
k_threshold = inputs.flatten().kthvalue(max(nb_elems - nb_min, 1)).values
|
||||
mask = (inputs > k_threshold).type(inputs.type()).bool()
|
||||
return mask
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, gradOutput):
|
||||
return gradOutput, None, None
|
||||
|
||||
|
||||
class TopKBinarizer(autograd.Function):
|
||||
"""
|
||||
Top-k Binarizer.
|
||||
Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
|
||||
is among the k% highest values of S.
|
||||
|
||||
Implementation is inspired from:
|
||||
https://github.com/allenai/hidden-networks
|
||||
What's hidden in a randomly weighted neural network?
|
||||
Vivek Ramanujan*, Mitchell Wortsman*, Aniruddha Kembhavi, Ali Farhadi, Mohammad Rastegari
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, inputs: torch.tensor, threshold: float, k_threshold=None):
|
||||
"""
|
||||
Args:
|
||||
inputs (`torch.FloatTensor`)
|
||||
The input matrix from which the binarizer computes the binary mask.
|
||||
threshold (`float`)
|
||||
The percentage of weights to keep (the rest is pruned).
|
||||
`threshold` is a float between 0 and 1.
|
||||
Returns:
|
||||
mask (`torch.FloatTensor`)
|
||||
Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
|
||||
retained, 0 - the associated weight is pruned).
|
||||
"""
|
||||
# Get the subnetwork by sorting the inputs and using the top threshold %
|
||||
if k_threshold is None:
|
||||
mask = inputs.clone()
|
||||
_, idx = inputs.flatten().sort(descending=True)
|
||||
j = int(threshold * inputs.numel())
|
||||
# flat_out and mask access the same memory.
|
||||
flat_out = mask.flatten()
|
||||
flat_out[idx[j:]] = 0
|
||||
flat_out[idx[:j]] = 1
|
||||
|
||||
# if threshold == 1:
|
||||
# k_threshold = -1000
|
||||
# else:
|
||||
# n = inputs.numel()
|
||||
# kth = min(max(n - (int(n * threshold) + 1), 1), n)
|
||||
# k_threshold = inputs.flatten().kthvalue(kth).values
|
||||
# mask = (inputs > k_threshold).type(inputs.type())
|
||||
else:
|
||||
if threshold == 1.0:
|
||||
mask = (inputs > -1000).type(inputs.type())
|
||||
else:
|
||||
mask = (inputs > k_threshold).type(inputs.type())
|
||||
|
||||
# # Get the subnetwork by get the kthvalue
|
||||
# # ==> This method will cause bug since if all the mask_scores are the same, the mask is all zero.
|
||||
# n = inputs.numel()
|
||||
# kth = max(n - (int(n * threshold) + 1), 1)
|
||||
# k_threshold = inputs.flatten().kthvalue(kth).values
|
||||
# mask = (inputs > k_threshold).type(inputs.type())
|
||||
|
||||
# if torch.distributed.get_rank() == 0:
|
||||
# print("inputs:")
|
||||
# print(inputs, flush=True)
|
||||
# print('inputs isinf:')
|
||||
# print(torch.isinf(inputs), flush=True)
|
||||
# print('inputs isinf number:')
|
||||
# print(torch.isinf(inputs).sum(), flush=True)
|
||||
#
|
||||
# print('\n\n\nMask:')
|
||||
# print(mask, flush=True)
|
||||
# print('Mask isinf:')
|
||||
# print(torch.isinf(mask), flush=True)
|
||||
# print('Mask isinf number:')
|
||||
# print(torch.isinf(mask).sum(), flush=True)
|
||||
# print('Mask sum:')
|
||||
# print(torch.sum(mask), flush=True)
|
||||
#
|
||||
# print('inputs (mask_scores).mean(): ', inputs.mean().detach().cpu(), flush=True)
|
||||
# print('inputs (mask_scores).max(): ', inputs.max().detach().cpu(), flush=True)
|
||||
# print('inputs (mask_scores).min(): ', inputs.min().detach().cpu(), flush=True)
|
||||
# print('inputs is all 0?', (inputs != torch.tensor(0).type(inputs.type())).sum().detach().cpu().numpy())
|
||||
# print("\n\n\nMask ratio: {}/{}={}".format(mask.sum().detach().cpu(), inputs.numel(), (mask.sum().detach().cpu().numpy()) / float(inputs.numel())), flush=True)
|
||||
# print("threshold: {}".format(threshold), flush=True)
|
||||
|
||||
return mask
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, gradOutput):
|
||||
return gradOutput, None, None
|
||||
|
||||
|
||||
class MagnitudeBinarizer(object):
|
||||
"""
|
||||
Magnitude Binarizer.
|
||||
Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
|
||||
is among the k% highest values of |S| (absolute value).
|
||||
|
||||
Implementation is inspired from https://github.com/NervanaSystems/distiller/blob/2291fdcc2ea642a98d4e20629acb5a9e2e04b4e6/distiller/pruning/automated_gradual_pruner.py#L24
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def apply(inputs: torch.tensor, threshold: float):
|
||||
"""
|
||||
Args:
|
||||
inputs (`torch.FloatTensor`)
|
||||
The input matrix from which the binarizer computes the binary mask.
|
||||
This input matrix is typically the weight matrix.
|
||||
threshold (`float`)
|
||||
The percentage of weights to keep (the rest is pruned).
|
||||
`threshold` is a float between 0 and 1.
|
||||
Returns:
|
||||
mask (`torch.FloatTensor`)
|
||||
Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
|
||||
retained, 0 - the associated weight is pruned).
|
||||
"""
|
||||
# Get the subnetwork by sorting the inputs and using the top threshold %
|
||||
mask = inputs.clone()
|
||||
_, idx = inputs.abs().flatten().sort(descending=True)
|
||||
j = int(threshold * inputs.numel())
|
||||
# flat_out and mask access the same memory.
|
||||
flat_out = mask.flatten()
|
||||
flat_out[idx[j:]] = 0
|
||||
flat_out[idx[:j]] = 1
|
||||
# mask = mask.bool()
|
||||
return mask
|
||||
|
||||
# # Get the subnetwork by sorting the inputs and using the top threshold
|
||||
# # ==> This method will cause bug since if all the mask_scores are the same, the mask is all zero.
|
||||
# n = inputs.numel()
|
||||
# kth = max(n - (int(n * threshold) + 1), 1)
|
||||
# k_threshold = inputs.abs().flatten().kthvalue(kth).values
|
||||
# mask = (inputs > k_threshold).type(inputs.type())
|
||||
# return mask
|
||||
|
||||
class MaskTaylor(autograd.Function):
|
||||
@staticmethod
|
||||
def forward(ctx, weight, mask):
|
||||
ctx.save_for_backward(weight, mask)
|
||||
return mask*weight
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, gradOutput):
|
||||
weight, mask, = ctx.saved_tensors
|
||||
return gradOutput*mask, -torch.pow(gradOutput*weight, 2)
|
||||
# return gradOutput*mask, -torch.abs(gradOutput*weight)
|
||||
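All three binarizers share the same calling convention; the single-process sketch below shows how they turn a synthetic score matrix into a keep/prune mask.

import torch
from modelscope.utils.nlp.mpu.binarizer import (MagnitudeBinarizer,
                                                ThresholdBinarizer,
                                                TopKBinarizer)

scores = torch.randn(4, 8)
# Keep entries whose sigmoid score exceeds 0.5.
mask_t = ThresholdBinarizer.apply(scores, 0.5, True)
# Keep the top 25% of scores, prune the rest.
mask_k = TopKBinarizer.apply(scores, 0.25, None)
# Keep the 25% largest entries by absolute value.
mask_m = MagnitudeBinarizer.apply(scores, 0.25)
print(int(mask_k.sum()))  # == int(0.25 * scores.numel())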
109
modelscope/utils/nlp/mpu/cross_entropy.py
Executable file
@@ -0,0 +1,109 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
from .initialize import get_model_parallel_group
|
||||
from .initialize import get_model_parallel_rank
|
||||
from .initialize import get_model_parallel_world_size
|
||||
from .utils import VocabUtility
|
||||
|
||||
|
||||
class _VocabParallelCrossEntropy(torch.autograd.Function):
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, vocab_parallel_logits, target):
|
||||
|
||||
# Copy so the input remains unchanged.
|
||||
logits = vocab_parallel_logits.clone()
|
||||
# Maximum value along vocab dimension across all GPUs.
|
||||
logits_max = torch.max(logits, dim=-1)[0]
|
||||
torch.distributed.all_reduce(logits_max,
|
||||
op=torch.distributed.ReduceOp.MAX,
|
||||
group=get_model_parallel_group())
|
||||
# Subtract the maximum value.
|
||||
logits.sub_(logits_max.unsqueeze(dim=-1))
|
||||
# Sum of exponential of logits along vocab dimension across all GPUs.
|
||||
exp_logits = logits.exp()
|
||||
sum_exp_logits = exp_logits.sum(dim=-1)
|
||||
torch.distributed.all_reduce(sum_exp_logits,
|
||||
op=torch.distributed.ReduceOp.SUM,
|
||||
group=get_model_parallel_group())
|
||||
|
||||
# Get the partition's vocab indices
|
||||
get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
|
||||
partition_vocab_size = vocab_parallel_logits.size()[-1]
|
||||
rank = get_model_parallel_rank()
|
||||
world_size = get_model_parallel_world_size()
|
||||
vocab_start_index, vocab_end_index = get_vocab_range(
|
||||
partition_vocab_size, rank, world_size)
|
||||
|
||||
# Create a mask of valid vocab ids (1 means it needs to be masked).
|
||||
target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
|
||||
masked_target = target.clone() - vocab_start_index
|
||||
masked_target[target_mask] = 0
|
||||
|
||||
# Get predicted-logits = logits[target].
|
||||
# For Simplicity, we convert logits to a 2-D tensor with size
|
||||
# [*, partition-vocab-size] and target to a 1-D tensor of size [*].
|
||||
logits_2d = logits.view(-1, partition_vocab_size)
|
||||
masked_target_1d = masked_target.view(-1)
|
||||
arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
|
||||
device=logits_2d.device)
|
||||
predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
|
||||
predicted_logits = predicted_logits_1d.view_as(target)
|
||||
predicted_logits[target_mask] = 0.0
|
||||
# All reduce is needed to get the chunks from other GPUs.
|
||||
torch.distributed.all_reduce(predicted_logits,
|
||||
op=torch.distributed.ReduceOp.SUM,
|
||||
group=get_model_parallel_group())
|
||||
|
||||
# Loss = log(sum(exp(logits))) - predicted-logit.
|
||||
loss = torch.log(sum_exp_logits) - predicted_logits
|
||||
|
||||
# Store softmax, target-mask and masked-target for backward pass.
|
||||
exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
|
||||
ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
|
||||
|
||||
return loss
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
|
||||
# Retrieve tensors from the forward pass.
|
||||
softmax, target_mask, masked_target_1d = ctx.saved_tensors
|
||||
|
||||
# All the inputs have softmax as their gradient.
|
||||
grad_input = softmax
|
||||
# For simplicity, work with the 2D gradient.
|
||||
partition_vocab_size = softmax.size()[-1]
|
||||
grad_2d = grad_input.view(-1, partition_vocab_size)
|
||||
|
||||
# Add the gradient from matching classes.
|
||||
arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
|
||||
device=grad_2d.device)
|
||||
grad_2d[arange_1d, masked_target_1d] -= (
|
||||
1.0 - target_mask.view(-1).float())
|
||||
|
||||
# Finally elementwise multiplication with the output gradients.
|
||||
grad_input.mul_(grad_output.unsqueeze(dim=-1))
|
||||
|
||||
return grad_input, None
|
||||
|
||||
|
||||
def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
|
||||
"""Helper function for the cross entropy."""
|
||||
return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
|
||||
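With a model parallel world size of 1, the partition holds the whole vocabulary and the loss above reduces to ordinary cross entropy, loss = log(sum(exp(logits))) - logit_target. A quick single-process check of that identity in plain torch (no distributed setup needed):

import torch
import torch.nn.functional as F

logits = torch.randn(3, 10)                      # [batch, vocab]
target = torch.randint(0, 10, (3,))

manual = torch.log(torch.exp(logits).sum(dim=-1)) - logits[torch.arange(3), target]
reference = F.cross_entropy(logits, target, reduction='none')
print(torch.allclose(manual, reference))         # True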
116
modelscope/utils/nlp/mpu/data.py
Executable file
@@ -0,0 +1,116 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from .initialize import get_model_parallel_group
|
||||
from .initialize import get_model_parallel_rank
|
||||
from .initialize import get_model_parallel_src_rank
|
||||
|
||||
|
||||
_MAX_DATA_DIM = 5
|
||||
|
||||
|
||||
def _check_data_types(keys, data, target_dtype):
|
||||
"""Check that all the keys have the same target data type."""
|
||||
for key in keys:
|
||||
assert data[key].dtype == target_dtype, '{} has data type {} which '\
|
||||
'is different than {}'.format(key, data[key].dtype, target_dtype)
|
||||
|
||||
|
||||
def _build_key_size_numel_dictionaries(keys, data):
|
||||
"""Build the size on rank 0 and broadcast."""
|
||||
max_dim = _MAX_DATA_DIM
|
||||
sizes = [0 for _ in range(max_dim) for _ in keys]
|
||||
|
||||
# Pack the sizes on rank zero.
|
||||
if get_model_parallel_rank() == 0:
|
||||
offset = 0
|
||||
for key in keys:
|
||||
assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
|
||||
size = data[key].size()
|
||||
for i, s in enumerate(size):
|
||||
sizes[i + offset] = s
|
||||
offset += max_dim
|
||||
|
||||
# Move to GPU and broadcast.
|
||||
sizes_cuda = torch.cuda.LongTensor(sizes)
|
||||
torch.distributed.broadcast(sizes_cuda, get_model_parallel_src_rank(),
|
||||
group=get_model_parallel_group())
|
||||
|
||||
# Move back to cpu and unpack.
|
||||
sizes_cpu = sizes_cuda.cpu()
|
||||
key_size = {}
|
||||
key_numel = {}
|
||||
total_numel = 0
|
||||
offset = 0
|
||||
for key in keys:
|
||||
i = 0
|
||||
size = []
|
||||
numel = 1
|
||||
while sizes_cpu[offset + i] > 0:
|
||||
this_size = sizes_cpu[offset + i]
|
||||
size.append(this_size)
|
||||
numel *= this_size
|
||||
i += 1
|
||||
key_size[key] = size
|
||||
key_numel[key] = numel
|
||||
total_numel += numel
|
||||
offset += max_dim
|
||||
|
||||
return key_size, key_numel, total_numel
|
||||
|
||||
|
||||
def broadcast_data(keys, data, datatype):
|
||||
"""Broadcast data from rank zero of each model parallel group to the
|
||||
members of the same model parallel group.
|
||||
|
||||
Arguments:
|
||||
keys: list of keys in the data dictionary to be broadcast
|
||||
data: data dictionary of string keys and cpu tensor values.
|
||||
datatype: torch data type of all tensors in data associated
|
||||
with keys.
|
||||
"""
|
||||
# Build (key, size) and (key, number of elements) dictionaries along
|
||||
# with the total number of elements on all ranks.
|
||||
key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
|
||||
data)
|
||||
|
||||
# Pack on rank zero.
|
||||
if get_model_parallel_rank() == 0:
|
||||
# Check that all keys have the same data type.
|
||||
_check_data_types(keys, data, datatype)
|
||||
# Flatten the data associated with the keys
|
||||
flatten_data = torch.cat(
|
||||
[data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
|
||||
else:
|
||||
flatten_data = torch.empty(total_numel,
|
||||
device=torch.cuda.current_device(),
|
||||
dtype=datatype)
|
||||
|
||||
# Broadcast
|
||||
torch.distributed.broadcast(flatten_data, get_model_parallel_src_rank(),
|
||||
group=get_model_parallel_group())
|
||||
|
||||
# Unpack
|
||||
output = {}
|
||||
offset = 0
|
||||
for key in keys:
|
||||
size = key_size[key]
|
||||
numel = key_numel[key]
|
||||
output[key] = flatten_data.narrow(0, offset, numel).view(size)
|
||||
offset += numel
|
||||
|
||||
return output
|
||||
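broadcast_data is called collectively: every rank in a model parallel group passes the same keys, but only rank 0 of the group needs real tensors. A rough usage sketch, assuming torch.distributed and the mpu groups are already initialized (shapes are illustrative):

import torch
from modelscope.utils.nlp import mpu

keys = ['input_ids', 'labels']
if mpu.get_model_parallel_rank() == 0:
    data = {'input_ids': torch.randint(0, 30000, (8, 128)),
            'labels': torch.randint(0, 2, (8, ))}
else:
    data = None  # non-source ranks receive the tensors from the broadcast

batch = mpu.broadcast_data(keys, data, torch.int64)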
74
modelscope/utils/nlp/mpu/grads.py
Executable file
@@ -0,0 +1,74 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# Parts of the code here are adapted from PyTorch
|
||||
# repo: https://github.com/pytorch/pytorch
|
||||
|
||||
|
||||
import torch
|
||||
from torch._six import inf
|
||||
|
||||
from .initialize import get_model_parallel_group
|
||||
from .initialize import get_model_parallel_rank
|
||||
|
||||
|
||||
def clip_grad_norm(parameters, max_norm, norm_type=2):
|
||||
"""Clips gradient norm of an iterable of parameters.
|
||||
|
||||
This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
|
||||
added functionality to handle model parallel parameters. Note that
|
||||
the gradients are modified in place.
|
||||
|
||||
Arguments:
|
||||
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
|
||||
single Tensor that will have gradients normalized
|
||||
max_norm (float or int): max norm of the gradients
|
||||
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
|
||||
infinity norm.
|
||||
|
||||
Returns:
|
||||
Total norm of the parameters (viewed as a single vector).
|
||||
"""
|
||||
if isinstance(parameters, torch.Tensor):
|
||||
parameters = [parameters]
|
||||
parameters = list(filter(lambda p: p.grad is not None, parameters))
|
||||
max_norm = float(max_norm)
|
||||
norm_type = float(norm_type)
|
||||
if norm_type == inf:
|
||||
total_norm = max(p.grad.data.abs().max() for p in parameters)
|
||||
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
|
||||
# Take max across all GPUs.
|
||||
torch.distributed.all_reduce(total_norm_cuda,
|
||||
op=torch.distributed.ReduceOp.MAX,
|
||||
group=get_model_parallel_group())
|
||||
total_norm = total_norm_cuda[0].item()
|
||||
else:
|
||||
total_norm = 0
|
||||
for p in parameters:
|
||||
if p.model_parallel or (get_model_parallel_rank() == 0):
|
||||
param_norm = p.grad.data.norm(norm_type)
|
||||
total_norm += param_norm.item() ** norm_type
|
||||
# Sum across all model parallel GPUs.
|
||||
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
|
||||
torch.distributed.all_reduce(total_norm_cuda,
|
||||
op=torch.distributed.ReduceOp.SUM,
|
||||
group=get_model_parallel_group())
|
||||
total_norm = total_norm_cuda[0].item() ** (1. / norm_type)
|
||||
clip_coef = max_norm / (total_norm + 1e-6)
|
||||
if clip_coef < 1:
|
||||
for p in parameters:
|
||||
p.grad.data.mul_(clip_coef)
|
||||
return total_norm
|
||||
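clip_grad_norm mirrors torch.nn.utils.clip_grad_norm_, but parameters tagged with .model_parallel contribute their norm from every model parallel rank, so all ranks clip with the same global norm. A small sketch, assuming the model parallel groups are initialized; note that plain (non-mpu) parameters need the model_parallel attribute set by hand, since only the mpu layers set it themselves:

import torch
from modelscope.utils.nlp import mpu

layer = torch.nn.Linear(16, 4).cuda()
for p in layer.parameters():
    p.model_parallel = False  # mpu layers mark their own parameters

loss = layer(torch.randn(2, 16, device='cuda')).sum()
loss.backward()

# Clip to a max L2 norm of 1.0; the return value is the pre-clipping total norm.
total_norm = mpu.clip_grad_norm(layer.parameters(), max_norm=1.0)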
134
modelscope/utils/nlp/mpu/initialize.py
Executable file
@@ -0,0 +1,134 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
"""Model and data parallel groups."""
|
||||
|
||||
import torch
|
||||
|
||||
from .utils import ensure_divisibility
|
||||
|
||||
|
||||
# Model parallel group that the current rank belongs to.
|
||||
_MODEL_PARALLEL_GROUP = None
|
||||
# Data parallel group that the current rank belongs to.
|
||||
_DATA_PARALLEL_GROUP = None
|
||||
|
||||
|
||||
def initialize_model_parallel(model_parallel_size_):
|
||||
"""
|
||||
Initialize model data parallel groups.
|
||||
|
||||
Arguments:
|
||||
model_parallel_size: number of GPUs used to parallelize model.
|
||||
|
||||
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
|
||||
use 2 GPUs to parallelize the model. The present function will
|
||||
create 4 model parallel groups and 2 data parallel groups as:
|
||||
4 model parallel groups:
|
||||
[g0, g1], [g2, g3], [g4, g5], [g6, g7]
|
||||
2 data parallel groups:
|
||||
[g0, g2, g4, g6], [g1, g3, g5, g7]
|
||||
Note that for efficiency, the caller should make sure adjacent ranks
|
||||
are on the same DGX box. For example if we are using 2 DGX-1 boxes
|
||||
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
|
||||
ranks 8 to 15 belong to the second box.
|
||||
"""
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> initializing model parallel with size {}'.format(
|
||||
model_parallel_size_))
|
||||
# Get world size and rank. Ensure some consistencies.
|
||||
assert torch.distributed.is_initialized()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
model_parallel_size = min(model_parallel_size_, world_size)
|
||||
ensure_divisibility(world_size, model_parallel_size)
|
||||
rank = torch.distributed.get_rank()
|
||||
# Build the data parallel groups.
|
||||
global _DATA_PARALLEL_GROUP
|
||||
assert _DATA_PARALLEL_GROUP is None, \
|
||||
'data parallel group is already initialized'
|
||||
for i in range(model_parallel_size):
|
||||
ranks = range(i, world_size, model_parallel_size)
|
||||
group = torch.distributed.new_group(ranks)
|
||||
if i == (rank % model_parallel_size):
|
||||
_DATA_PARALLEL_GROUP = group
|
||||
|
||||
# Build the model parallel groups.
|
||||
global _MODEL_PARALLEL_GROUP
|
||||
assert _MODEL_PARALLEL_GROUP is None, \
|
||||
'model parallel group is already initialized'
|
||||
for i in range(world_size // model_parallel_size):
|
||||
ranks = range(i * model_parallel_size,
|
||||
(i + 1) * model_parallel_size)
|
||||
group = torch.distributed.new_group(ranks)
|
||||
if i == (rank // model_parallel_size):
|
||||
_MODEL_PARALLEL_GROUP = group
|
||||
|
||||
|
||||
def model_parallel_is_initialized():
|
||||
"""Check if model and data parallel groups are initialized."""
|
||||
if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def get_model_parallel_group():
|
||||
"""Get the model parallel group the caller rank belongs to."""
|
||||
assert _MODEL_PARALLEL_GROUP is not None, \
|
||||
'model parallel group is not initialized'
|
||||
return _MODEL_PARALLEL_GROUP
|
||||
|
||||
|
||||
def get_data_parallel_group():
|
||||
"""Get the data parallel group the caller rank belongs to."""
|
||||
assert _DATA_PARALLEL_GROUP is not None, \
|
||||
'data parallel group is not initialized'
|
||||
return _DATA_PARALLEL_GROUP
|
||||
|
||||
|
||||
def get_model_parallel_world_size():
|
||||
"""Return world size for the model parallel group."""
|
||||
return torch.distributed.get_world_size(group=get_model_parallel_group())
|
||||
|
||||
|
||||
def get_model_parallel_rank():
|
||||
"""Return my rank for the model parallel group."""
|
||||
return torch.distributed.get_rank(group=get_model_parallel_group())
|
||||
|
||||
|
||||
def get_model_parallel_src_rank():
|
||||
"""Calculate the global rank corresponding to a local rank zeor
|
||||
in the model parallel group."""
|
||||
global_rank = torch.distributed.get_rank()
|
||||
local_world_size = get_model_parallel_world_size()
|
||||
return (global_rank // local_world_size) * local_world_size
|
||||
|
||||
|
||||
def get_data_parallel_world_size():
|
||||
"""Return world size for the data parallel group."""
|
||||
return torch.distributed.get_world_size(group=get_data_parallel_group())
|
||||
|
||||
|
||||
def get_data_parallel_rank():
|
||||
"""Return my rank for the data parallel group."""
|
||||
return torch.distributed.get_rank(group=get_data_parallel_group())
|
||||
|
||||
|
||||
def destroy_model_parallel():
|
||||
"""Set the groups to none."""
|
||||
global _MODEL_PARALLEL_GROUP
|
||||
_MODEL_PARALLEL_GROUP = None
|
||||
global _DATA_PARALLEL_GROUP
|
||||
_DATA_PARALLEL_GROUP = None
|
||||
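The grouping rule in initialize_model_parallel can be reproduced offline; the pure-Python sketch below prints the groups from the docstring example (8 GPUs, model parallel size 2) without any distributed setup:

world_size, model_parallel_size = 8, 2

data_parallel_groups = [list(range(i, world_size, model_parallel_size))
                        for i in range(model_parallel_size)]
model_parallel_groups = [list(range(i * model_parallel_size, (i + 1) * model_parallel_size))
                         for i in range(world_size // model_parallel_size)]

print(data_parallel_groups)   # [[0, 2, 4, 6], [1, 3, 5, 7]]
print(model_parallel_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]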
420
modelscope/utils/nlp/mpu/layers.py
Executable file
@@ -0,0 +1,420 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# Parts of the code here are adapted from PyTorch
|
||||
# repo: https://github.com/pytorch/pytorch
|
||||
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn.init as init
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
|
||||
|
||||
from .initialize import get_model_parallel_rank
|
||||
from .initialize import get_model_parallel_world_size
|
||||
from .mappings import copy_to_model_parallel_region
|
||||
from .mappings import gather_from_model_parallel_region
|
||||
from .mappings import reduce_from_model_parallel_region
|
||||
from .mappings import scatter_to_model_parallel_region
|
||||
from .random import get_cuda_rng_tracker
|
||||
from .utils import divide
|
||||
from .utils import split_tensor_along_last_dim
|
||||
from .utils import VocabUtility
|
||||
from deepspeed.utils.timer import SynchronizedWallClockTimer
|
||||
|
||||
from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer, MaskTaylor
|
||||
|
||||
def _initialize_affine_weight(weight, output_size, input_size,
|
||||
per_partition_size, partition_dim, init_method,
|
||||
stride=1, return_master_weight=False):
|
||||
"""Initialize affine weight for model parallel.
|
||||
|
||||
Build the master weight on all processes and scatter
|
||||
the relevant chunk."""
|
||||
# If we only use 1 process for model parallelism, bypass scatter.
|
||||
world_size = get_model_parallel_world_size()
|
||||
if world_size == 1:
|
||||
init_method(weight)
|
||||
if return_master_weight:
|
||||
return weight
|
||||
return None
|
||||
|
||||
# Initialize master weight
|
||||
master_weight = torch.empty(output_size, input_size,
|
||||
dtype=weight.dtype,
|
||||
requires_grad=False)
|
||||
init_method(master_weight)
|
||||
|
||||
# Split and copy
|
||||
per_partition_per_stride_size = divide(per_partition_size, stride)
|
||||
weight_list = torch.split(master_weight, per_partition_per_stride_size,
|
||||
dim=partition_dim)
|
||||
rank = get_model_parallel_rank()
|
||||
my_weight_list = weight_list[rank::world_size]
|
||||
|
||||
with torch.no_grad():
|
||||
torch.cat(my_weight_list, dim=partition_dim, out=weight)
|
||||
if return_master_weight:
|
||||
return master_weight
|
||||
return None
|
||||
|
||||
|
||||
class VocabParallelEmbedding(torch.nn.Module):
|
||||
"""Embedding parallelized in the vocabulary dimension.
|
||||
|
||||
This is mainly adapted from torch.nn.Embedding and all the default
|
||||
values are kept.
|
||||
Arguments:
|
||||
num_embeddings: vocabulary size.
|
||||
embedding_dim: size of hidden state.
|
||||
init_method: method to initialize weights.
|
||||
"""
|
||||
def __init__(self, num_embeddings, embedding_dim,
|
||||
init_method=init.xavier_normal_):
|
||||
super(VocabParallelEmbedding, self).__init__()
|
||||
# Keep the input dimensions.
|
||||
self.num_embeddings = num_embeddings
|
||||
self.embedding_dim = embedding_dim
|
||||
# Set the defaults for compatibility.
|
||||
self.padding_idx = None
|
||||
self.max_norm = None
|
||||
self.norm_type = 2.
|
||||
self.scale_grad_by_freq = False
|
||||
self.sparse = False
|
||||
self._weight = None
|
||||
# Divide the weight matrix along the vocabulary dimension.
|
||||
self.vocab_start_index, self.vocab_end_index = \
|
||||
VocabUtility.vocab_range_from_global_vocab_size(
|
||||
self.num_embeddings, get_model_parallel_rank(),
|
||||
get_model_parallel_world_size())
|
||||
self.num_embeddings_per_partition = self.vocab_end_index - \
|
||||
self.vocab_start_index
|
||||
|
||||
# Allocate weights.
|
||||
self.weight = Parameter(torch.Tensor(self.num_embeddings_per_partition,
|
||||
self.embedding_dim))
|
||||
self.weight.model_parallel = True
|
||||
# And initialize.
|
||||
_initialize_affine_weight(
|
||||
self.weight, self.num_embeddings, self.embedding_dim,
|
||||
self.num_embeddings_per_partition, 0, init_method)
|
||||
self.timers = SynchronizedWallClockTimer()
|
||||
|
||||
def forward(self, input_):
|
||||
#self.timers('embedding').start()
|
||||
# Build the mask.
|
||||
input_mask = (input_ < self.vocab_start_index) | \
|
||||
(input_ >= self.vocab_end_index)
|
||||
# Mask the input.
|
||||
masked_input = input_.clone() - self.vocab_start_index
|
||||
masked_input[input_mask] = 0
|
||||
# Get the embeddings.
|
||||
output_parallel = F.embedding(masked_input, self.weight,
|
||||
self.padding_idx, self.max_norm,
|
||||
self.norm_type, self.scale_grad_by_freq,
|
||||
self.sparse)
|
||||
# Mask the output embedding.
|
||||
output_parallel[input_mask, :] = 0.0
|
||||
# Reduce across all the model parallel GPUs.
|
||||
#self.timers('embedding').stop()
|
||||
#self.timers('embedding reduce').start()
|
||||
output = reduce_from_model_parallel_region(output_parallel)
|
||||
#self.timers('embedding reduce').stop()
|
||||
#timer_names = ['embedding', 'embedding reduce']
|
||||
#self.timers.log(names=timer_names)
|
||||
return output
|
||||
|
||||
|
||||
class ParallelEmbedding(torch.nn.Module):
|
||||
"""Embedding parallelized in the embedding dimension.
|
||||
|
||||
This is mainly adapted from torch.nn.Embedding and all the default
|
||||
values are kept.
|
||||
Arguments:
|
||||
num_embeddings: vocabulary size.
|
||||
embedding_dim: size of hidden state.
|
||||
init_method: method to initialize weights.
|
||||
"""
|
||||
def __init__(self, num_embeddings, embedding_dim,
|
||||
init_method=init.xavier_normal_,
|
||||
keep_master_weight_for_test=False):
|
||||
super(ParallelEmbedding, self).__init__()
|
||||
# Keep the input dimensions.
|
||||
self.num_embeddings = num_embeddings
|
||||
self.embedding_dim = embedding_dim
|
||||
# Set some defaults for compatibility.
|
||||
self.padding_idx = None
|
||||
self.max_norm = None
|
||||
self.norm_type = 2.
|
||||
self.scale_grad_by_freq = False
|
||||
self.sparse = False
|
||||
self._weight = None
|
||||
# Divide the weight matrix along the embedding dimension.
|
||||
world_size = get_model_parallel_world_size()
|
||||
self.embedding_dim_per_partition = divide(self.embedding_dim,
|
||||
world_size)
|
||||
|
||||
# Allocate weights.
|
||||
self.weight = Parameter(torch.Tensor(self.num_embeddings,
|
||||
self.embedding_dim_per_partition))
|
||||
self.weight.model_parallel = True
|
||||
# And initialize.
|
||||
_initialize_affine_weight(
|
||||
self.weight, self.num_embeddings, self.embedding_dim,
|
||||
self.embedding_dim_per_partition, 1, init_method,
|
||||
stride=1, return_master_weight=False)
|
||||
|
||||
def forward(self, input_):
|
||||
input_parallel = copy_to_model_parallel_region(input_)
|
||||
output_parallel = F.embedding(input_parallel, self.weight,
|
||||
self.padding_idx, self.max_norm,
|
||||
self.norm_type, self.scale_grad_by_freq,
|
||||
self.sparse)
|
||||
output = gather_from_model_parallel_region(output_parallel)
|
||||
return output
|
||||
|
||||
|
||||
class ColumnParallelLinear(torch.nn.Module):
|
||||
"""Linear layer with column parallelism.
|
||||
|
||||
The linear layer is defined as Y = XA + b. A is parallelized along
|
||||
its second dimension as A = [A_1, ..., A_p].
|
||||
|
||||
Arguments:
|
||||
input_size: first dimension of matrix A.
|
||||
output_size: second dimension of matrix A.
|
||||
bias: If true, add bias
|
||||
gather_output: If true, call all-gather on output and make Y available
|
||||
to all GPUs, otherwise, every GPU will have its output
|
||||
which is Y_i = XA_i
|
||||
init_method: method to initialize weights. Note that bias is always set
|
||||
to zero.
|
||||
stride: For the strided linear layers.
|
||||
keep_master_weight_for_test: This was added for testing and should be
|
||||
set to False. It returns the master weights
|
||||
used for initialization.
|
||||
"""
|
||||
def __init__(self, input_size, output_size, bias=True, gather_output=True,
|
||||
init_method=init.xavier_normal_, stride=1,
|
||||
keep_master_weight_for_test=False,
|
||||
pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0,
|
||||
LR_weight_rank=8, LR_mask_rank=8):
|
||||
super(ColumnParallelLinear, self).__init__()
|
||||
|
||||
# Keep input parameters
|
||||
self.input_size = input_size
|
||||
self.output_size = output_size
|
||||
self.gather_output = gather_output
|
||||
# Divide the weight matrix along the last dimension.
|
||||
world_size = get_model_parallel_world_size()
|
||||
self.output_size_per_partition = divide(output_size, world_size)
|
||||
|
||||
# Parameters.
|
||||
# Note: torch.nn.functional.linear performs XA^T + b and as a result
|
||||
# we allocate the transpose.
|
||||
self.weight = Parameter(torch.Tensor(self.output_size_per_partition,
|
||||
self.input_size))
|
||||
self.weight.model_parallel = True
|
||||
if bias:
|
||||
self.bias = Parameter(torch.Tensor(self.output_size_per_partition))
|
||||
self.bias.model_parallel = True
|
||||
# Always initialize bias to zero.
|
||||
with torch.no_grad():
|
||||
self.bias.zero_()
|
||||
else:
|
||||
self.register_parameter('bias', None)
|
||||
|
||||
# Initialize weight.
|
||||
self.master_weight = _initialize_affine_weight(
|
||||
self.weight, self.output_size, self.input_size,
|
||||
self.output_size_per_partition, 0, init_method,
|
||||
stride=stride, return_master_weight=keep_master_weight_for_test)
|
||||
# self.timers = SynchronizedWallClockTimer()
|
||||
|
||||
self.pruning_method = None
|
||||
|
||||
def init_mask(self):
|
||||
if self.pruning_mask_init == "constant":
|
||||
init.constant_(self.mask_scores, val=self.pruning_mask_scale)
|
||||
elif self.pruning_mask_init == "uniform":
|
||||
init.uniform_(self.mask_scores, a=-self.pruning_mask_scale, b=self.pruning_mask_scale)
|
||||
elif self.pruning_mask_init == "kaiming":
|
||||
init.kaiming_uniform_(self.mask_scores, a=math.sqrt(5))
|
||||
|
||||
def load_mask(self, pruning_threshold, k_threshold=None):
|
||||
if self.pruning_method in ["finetune"]:
|
||||
# mask = TopKBinarizer.apply(self.mask_scores.cpu(), pruning_threshold, k_threshold)
|
||||
if k_threshold is not None:
|
||||
self.mask[self.mask_scores <= k_threshold] = 0
|
||||
else:
|
||||
_, idx = self.mask_scores.cpu().flatten().sort(descending=True)
|
||||
j = int(pruning_threshold * self.mask_scores.numel())
|
||||
flat_out = self.mask.flatten()
|
||||
flat_out[idx[j:]] = 0
|
||||
flat_out[idx[:j]] = 1
|
||||
|
||||
self.__setattr__("mask_scores", None)
|
||||
del self.mask_scores
|
||||
self.weight.data = self.weight.data*self.mask.data
|
||||
|
||||
# mask = TopKBinarizer.apply(self.mask_scores, pruning_threshold, k_threshold)
|
||||
# self.mask.data = mask.data
|
||||
# self.weight.data = self.weight.data*self.mask.data
|
||||
# del self.mask_scores
|
||||
# self.__setattr__("mask_scores", None)
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def forward(self, input_, pruning_threshold=None):
|
||||
# Set up backprop all-reduce.
|
||||
input_parallel = copy_to_model_parallel_region(input_)
|
||||
|
||||
# Matrix multiply.
|
||||
if hasattr(self, 'linear'):
|
||||
output_parallel = self.linear(input_parallel, self.weight, self.bias)
|
||||
else:
|
||||
output_parallel = F.linear(input_parallel, self.weight, self.bias)
|
||||
|
||||
if self.gather_output:
|
||||
# All-gather across the partitions.
|
||||
output = gather_from_model_parallel_region(output_parallel)
|
||||
else:
|
||||
output = output_parallel
|
||||
return output
|
||||
|
||||
|
||||
class RowParallelLinear(torch.nn.Module):
|
||||
"""Linear layer with row parallelism.
|
||||
|
||||
The linear layer is defined as Y = XA + b. A is parallelized along
|
||||
its first dimension and X along its second dimension as:
|
||||
- -
|
||||
| A_1 |
|
||||
| . |
|
||||
A = | . | X = [X_1, ..., X_p]
|
||||
| . |
|
||||
| A_p |
|
||||
- -
|
||||
Arguments:
|
||||
input_size: first dimension of matrix A.
|
||||
output_size: second dimension of matrix A.
|
||||
bias: If true, add bias. Note that bias is not parallelized.
|
||||
input_is_parallel: If true, we assume that the input is already
|
||||
split across the GPUs and we do not split
|
||||
again.
|
||||
init_method: method to initialize weights. Note that bias is always set
|
||||
to zero.
|
||||
stride: For the strided linear layers.
|
||||
keep_master_weight_for_test: This was added for testing and should be
|
||||
set to False. It returns the master weights
|
||||
used for initialization.
|
||||
"""
|
||||
def __init__(self, input_size, output_size, bias=True,
|
||||
input_is_parallel=False,
|
||||
init_method=init.xavier_normal_, stride=1,
|
||||
keep_master_weight_for_test=False,
|
||||
pruning_method=None, pruning_mask_init='constant', pruning_mask_scale=0.0,
|
||||
LR_weight_rank=8, LR_mask_rank=8):
|
||||
super(RowParallelLinear, self).__init__()
|
||||
|
||||
# Keep input parameters
|
||||
self.input_size = input_size
|
||||
self.output_size = output_size
|
||||
self.input_is_parallel = input_is_parallel
|
||||
# Divide the weight matrix along the last dimension.
|
||||
world_size = get_model_parallel_world_size()
|
||||
self.input_size_per_partition = divide(input_size, world_size)
|
||||
|
||||
# Parameters.
|
||||
# Note: torch.nn.functional.linear performs XA^T + b and as a result
|
||||
# we allocate the transpose.
|
||||
self.weight = Parameter(torch.Tensor(self.output_size,
|
||||
self.input_size_per_partition))
|
||||
self.weight.model_parallel = True
|
||||
if bias:
|
||||
self.bias = Parameter(torch.Tensor(self.output_size))
|
||||
# Always initialize bias to zero.
|
||||
with torch.no_grad():
|
||||
self.bias.zero_()
|
||||
else:
|
||||
self.register_parameter('bias', None)
|
||||
|
||||
# Initialize weight.
|
||||
self.master_weight = _initialize_affine_weight(
|
||||
self.weight, self.output_size, self.input_size,
|
||||
self.input_size_per_partition, 1, init_method,
|
||||
stride=stride, return_master_weight=keep_master_weight_for_test)
|
||||
|
||||
self.pruning_method = None
|
||||
|
||||
def init_mask(self):
|
||||
if self.pruning_mask_init == "constant":
|
||||
init.constant_(self.mask_scores, val=self.pruning_mask_scale)
|
||||
elif self.pruning_mask_init == "uniform":
|
||||
init.uniform_(self.mask_scores, a=-self.pruning_mask_scale, b=self.pruning_mask_scale)
|
||||
elif self.pruning_mask_init == "kaiming":
|
||||
init.kaiming_uniform_(self.mask_scores, a=math.sqrt(5))
|
||||
|
||||
def load_mask(self, pruning_threshold, k_threshold=None):
|
||||
if self.pruning_method in ["finetune"]:
|
||||
if k_threshold is not None:
|
||||
self.mask[self.mask_scores <= k_threshold] = 0
|
||||
else:
|
||||
_, idx = self.mask_scores.cpu().flatten().sort(descending=True)
|
||||
j = int(pruning_threshold * self.mask_scores.numel())
|
||||
flat_out = self.mask.flatten()
|
||||
flat_out[idx[j:]] = 0
|
||||
flat_out[idx[:j]] = 1
|
||||
|
||||
self.__setattr__("mask_scores", None)
|
||||
del self.mask_scores
|
||||
self.weight.data = self.weight.data*self.mask.data
|
||||
|
||||
# mask = TopKBinarizer.apply(self.mask_scores, pruning_threshold, k_threshold)
|
||||
# self.mask.data = mask.data
|
||||
# self.weight.data = self.weight.data*self.mask.data
|
||||
# del self.mask_scores
|
||||
# self.__setattr__("mask_scores", None)
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def forward(self, input_, pruning_threshold=None):
|
||||
# Set up backprop all-reduce.
|
||||
if self.input_is_parallel:
|
||||
input_parallel = input_
|
||||
else:
|
||||
input_parallel = scatter_to_model_parallel_region(input_)
|
||||
|
||||
# Matrix multiply.
|
||||
if hasattr(self, 'linear'):
|
||||
output_parallel = self.linear(input_parallel, self.weight)
|
||||
else:
|
||||
output_parallel = F.linear(input_parallel, self.weight)
|
||||
|
||||
# All-reduce across all the partitions.
|
||||
output_ = reduce_from_model_parallel_region(output_parallel)
|
||||
if self.bias is not None:
|
||||
output = output_ + self.bias
|
||||
else:
|
||||
output = output_
|
||||
return output
|
||||
|
||||
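A common way to combine these layers is the Megatron-style MLP: a ColumnParallelLinear with gather_output=False feeds a RowParallelLinear with input_is_parallel=True, so the two GEMMs need no communication between them and a single all-reduce happens inside the row-parallel layer. A rough sketch, assuming the model parallel groups are initialized and apex (for the fused LayerNorm import above) is installed:

import torch
import torch.nn.functional as F
from modelscope.utils.nlp.mpu.layers import ColumnParallelLinear, RowParallelLinear

hidden, ffn = 1024, 4096
dense_h_to_4h = ColumnParallelLinear(hidden, ffn, gather_output=False).cuda()
dense_4h_to_h = RowParallelLinear(ffn, hidden, input_is_parallel=True).cuda()

x = torch.randn(2, 16, hidden, device='cuda')
y = dense_4h_to_h(F.gelu(dense_h_to_4h(x)))   # same shape as x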
165
modelscope/utils/nlp/mpu/mappings.py
Executable file
@@ -0,0 +1,165 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from .initialize import get_model_parallel_group
|
||||
from .utils import split_tensor_along_last_dim
|
||||
from deepspeed.utils.timer import SynchronizedWallClockTimer
|
||||
|
||||
def _reduce(input_):
|
||||
"""All-reduce the the input tensor across model parallel group."""
|
||||
group = get_model_parallel_group()
|
||||
|
||||
# Bypass the function if we are using only 1 GPU.
|
||||
if torch.distributed.get_world_size(group=group) == 1:
|
||||
return input_
|
||||
|
||||
# All-reduce.
|
||||
torch.distributed.all_reduce(input_, group=group)
|
||||
|
||||
return input_
|
||||
|
||||
|
||||
def _split(input_):
|
||||
"""Split the tensor along its last dimension and keep the
|
||||
corresponding slice."""
|
||||
group = get_model_parallel_group()
|
||||
|
||||
# Bypass the function if we are using only 1 GPU.
|
||||
if torch.distributed.get_world_size(group=group) == 1:
|
||||
return input_
|
||||
|
||||
# Split along last dimension.
|
||||
world_size = torch.distributed.get_world_size(group=group)
|
||||
input_list = split_tensor_along_last_dim(input_, world_size)
|
||||
|
||||
# Note: torch.split does not create contiguous tensors by default.
|
||||
rank = torch.distributed.get_rank(group=group)
|
||||
output = input_list[rank].contiguous()
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def _gather(input_):
|
||||
"""Gather tensors and concatinate along the last dimension."""
|
||||
group = get_model_parallel_group()
|
||||
|
||||
# Bypass the function if we are using only 1 GPU.
|
||||
if torch.distributed.get_world_size(group=group) == 1:
|
||||
return input_
|
||||
|
||||
# Size and dimension.
|
||||
last_dim = input_.dim() - 1
|
||||
rank = torch.distributed.get_rank(group=group)
|
||||
world_size = torch.distributed.get_world_size(group=group)
|
||||
|
||||
tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
|
||||
tensor_list[rank] = input_
|
||||
torch.distributed.all_gather(tensor_list, input_, group=group)
|
||||
|
||||
# Note: torch.cat already creates a contiguous tensor.
|
||||
output = torch.cat(tensor_list, dim=last_dim).contiguous()
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class _CopyToModelParallelRegion(torch.autograd.Function):
|
||||
"""Pass the input to the model parallel region."""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, input_):
|
||||
return input_
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
#timers = SynchronizedWallClockTimer()
|
||||
#timers('backward _Copy reduce').start()
|
||||
out = _reduce(grad_output)
|
||||
#timers('backward _Copy reduce').stop()
|
||||
#timers.log(names=['backward _Copy reduce'])
|
||||
return out
|
||||
#return _reduce(grad_output)
|
||||
|
||||
|
||||
class _ReduceFromModelParallelRegion(torch.autograd.Function):
|
||||
"""All-redcue the input from the model parallel region."""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, input_):
|
||||
#timers = SynchronizedWallClockTimer()
|
||||
#timers('forward _Reduce reduce').start()
|
||||
out = _reduce(input_)
|
||||
#timers('forward _Reduce reduce').stop()
|
||||
#timers.log(names=['forward _Reduce reduce'])
|
||||
return out
|
||||
#return _reduce(input_)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return grad_output
|
||||
|
||||
|
||||
class _ScatterToModelParallelRegion(torch.autograd.Function):
|
||||
"""Split the input and keep only the corresponding chuck to the rank."""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, input_):
|
||||
return _split(input_)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
#timers = SynchronizedWallClockTimer()
|
||||
#timers('backward _Scatter gather').start()
|
||||
out = _gather(grad_output)
|
||||
#timers('backward _Scatter gather').stop()
|
||||
#timers.log(names=['backward _Scatter gather'])
|
||||
return out
|
||||
#return _gather(grad_output)
|
||||
|
||||
|
||||
class _GatherFromModelParallelRegion(torch.autograd.Function):
|
||||
"""Gather the input from model parallel region and concatinate."""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, input_):
|
||||
#timers = SynchronizedWallClockTimer()
|
||||
#timers('forward _Gather gather').start()
|
||||
out = _gather(input_)
|
||||
#timers('forward _Gather gather').stop()
|
||||
#timers.log(names=['forward _Gather gather'])
|
||||
return out
|
||||
#return _gather(input_)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return _split(grad_output)
|
||||
|
||||
|
||||
# -----------------
|
||||
# Helper functions.
|
||||
# -----------------
|
||||
|
||||
def copy_to_model_parallel_region(input_):
|
||||
return _CopyToModelParallelRegion.apply(input_)
|
||||
|
||||
def reduce_from_model_parallel_region(input_):
|
||||
return _ReduceFromModelParallelRegion.apply(input_)
|
||||
|
||||
def scatter_to_model_parallel_region(input_):
|
||||
return _ScatterToModelParallelRegion.apply(input_)
|
||||
|
||||
def gather_from_model_parallel_region(input_):
|
||||
return _GatherFromModelParallelRegion.apply(input_)
|
||||
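Taken together, scatter followed by gather is an identity on the last dimension (each rank keeps its slice in the forward pass and the slices are re-concatenated), while copy/reduce play the mirrored roles in the backward pass. A quick sketch, assuming the model parallel groups are initialized:

import torch
from modelscope.utils.nlp import mpu

x = torch.randn(4, 16, device='cuda')
x_local = mpu.scatter_to_model_parallel_region(x)   # this rank's slice of the last dim
x_full = mpu.gather_from_model_parallel_region(x_local)
print(torch.allclose(x_full, x))                    # True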
391
modelscope/utils/nlp/mpu/random.py
Executable file
@@ -0,0 +1,391 @@
|
||||
# coding=utf-8
|
||||
#Modified by Samyam Rajbhandari
|
||||
#Used to partition the activations stored for backward propagation
|
||||
#Therefore reduces the memory consumption
|
||||
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# Parts of the code here are adapted from PyTorch
|
||||
# repo: https://github.com/pytorch/pytorch
|
||||
import contextlib
|
||||
import torch.distributed as dist
|
||||
import torch
|
||||
from torch import _C
|
||||
from torch.cuda import _lazy_call, device as device_ctx_manager
|
||||
#from torch.utils.checkpoint import detach_variable
|
||||
from ..utils import print_rank_0
|
||||
|
||||
|
||||
import torch.distributed as dist
|
||||
PARTITION_ACTIVATIONS = False
|
||||
PA_CORRECTNESS_TEST= False
|
||||
|
||||
def see_memory_usage(message, force=False):
|
||||
if not force:
|
||||
return
|
||||
dist.barrier()
|
||||
if dist.get_rank() == 0:
|
||||
print(message)
|
||||
print("Memory Allocated ", torch.cuda.memory_allocated()/(1024*1024*1024), "GigaBytes")
|
||||
print("Max Memory Allocated ", torch.cuda.max_memory_allocated()/(1024*1024*1024), "GigaBytes")
|
||||
print("Cache Allocated ", torch.cuda.memory_cached()/(1024*1024*1024), "GigaBytes")
|
||||
print("Max cache Allocated ", torch.cuda.max_memory_cached()/(1024*1024*1024), "GigaBytes")
|
||||
print(" ")
|
||||
#input("Press Any Key To Continue ..")
|
||||
|
||||
|
||||
from .initialize import get_data_parallel_rank
|
||||
from .initialize import get_model_parallel_rank
|
||||
from .initialize import get_model_parallel_world_size
|
||||
from .initialize import get_model_parallel_group
|
||||
|
||||
mp_rank = None #get_model_parallel_rank()
|
||||
mp_size = None #get_model_parallel_world_size()
|
||||
mp_group = None #get_model_parallel_group()
|
||||
|
||||
# Default name for the model parallel rng tracker.
|
||||
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
|
||||
transport_stream = None
|
||||
cuda_device=None
|
||||
def detach_variable(inputs, device=None):
|
||||
if isinstance(inputs, tuple):
|
||||
out = []
|
||||
for inp in inputs:
|
||||
if not isinstance(inp, torch.Tensor):
|
||||
out.append(inp)
|
||||
continue
|
||||
|
||||
requires_grad = inp.requires_grad
|
||||
|
||||
if device is not None:
|
||||
x = inp.to(device=device)
|
||||
else:
|
||||
x = inp
|
||||
|
||||
x = x.detach()
|
||||
x.requires_grad = requires_grad
|
||||
out.append(x)
|
||||
return tuple(out)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__)
|
||||
|
||||
def _set_cuda_rng_state(new_state, device=-1):
|
||||
"""Sets the random number generator state of the current GPU.
|
||||
|
||||
Arguments:
|
||||
new_state (torch.ByteTensor): The desired state
|
||||
This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
|
||||
with a single change: the input state is not cloned. Cloning caused
|
||||
major performance issues for +4 GPU cases.
|
||||
"""
|
||||
if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
|
||||
# older PyTorch
|
||||
def cb():
|
||||
with device_ctx_manager(device):
|
||||
_C._cuda_setRNGState(new_state)
|
||||
else:
|
||||
# newer PyTorch
|
||||
if device == -1:
|
||||
device = torch.device('cuda')
|
||||
elif isinstance(device, str):
|
||||
device = torch.device(device)
|
||||
elif isinstance(device, int):
|
||||
device = torch.device('cuda', device)
|
||||
|
||||
def cb():
|
||||
idx = device.index
|
||||
if idx is None:
|
||||
idx = torch.cuda.current_device()
|
||||
default_generator = torch.cuda.default_generators[idx]
|
||||
default_generator.set_state(new_state)
|
||||
|
||||
_lazy_call(cb)
|
||||
|
||||
|
||||
|
||||
class CudaRNGStatesTracker:
|
||||
"""Tracker for the cuda RNG states.
|
||||
|
||||
Using the `add` method, a cuda rng state is initialized based on
|
||||
the input `seed` and is assigned to `name`. Later, by forking the
|
||||
rng state, we can perform operations and return to our starting
|
||||
cuda state.
|
||||
"""
|
||||
def __init__(self):
|
||||
# Map from a string name to the cuda rng state.
|
||||
self.states_ = {}
|
||||
# Seeds are just for book keeping and ensure no seed is set twice.
|
||||
self.seeds_ = set()
|
||||
|
||||
def reset(self):
|
||||
"""Set to the initial state (no tracker)."""
|
||||
self.states_ = {}
|
||||
self.seeds_ = set()
|
||||
|
||||
def get_states(self):
|
||||
"""Get rng states. Copy the dictionary so we have direct
|
||||
pointers to the states, not just a pointer to the dictionary."""
|
||||
states = {}
|
||||
for name in self.states_:
|
||||
states[name] = self.states_[name]
|
||||
return states
|
||||
|
||||
def set_states(self, states):
|
||||
"""Set the rng states. For efficiency purposes, we do not check
|
||||
the size of seed for compatibility."""
|
||||
self.states_ = states
|
||||
|
||||
def add(self, name, seed):
|
||||
"""Track the rng state."""
|
||||
# Check seed is not already used.
|
||||
if seed in self.seeds_:
|
||||
raise Exception('seed {} already exists'.format(seed))
|
||||
self.seeds_.add(seed)
|
||||
# Check that state is not already defined.
|
||||
if name in self.states_:
|
||||
raise Exception('cuda rng state {} already exists'.format(name))
|
||||
# Get the current rng state.
|
||||
orig_rng_state = torch.cuda.get_rng_state()
|
||||
# Set the new state and store it.
|
||||
torch.cuda.manual_seed(seed)
|
||||
self.states_[name] = torch.cuda.get_rng_state()
|
||||
# Reset rng state to what it was.
|
||||
_set_cuda_rng_state(orig_rng_state)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
|
||||
"""Fork the cuda rng state, perform operations, and exit with
|
||||
the original state."""
|
||||
# Check if we have added the state
|
||||
if name not in self.states_:
|
||||
raise Exception('cuda rng state {} is not added'.format(name))
|
||||
# Store current rng state.
|
||||
orig_cuda_rng_state = torch.cuda.get_rng_state()
|
||||
# Set rng state to the desired one
|
||||
_set_cuda_rng_state(self.states_[name])
|
||||
# Do the stuff we wanted to do.
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
# Update the current rng state for later use.
|
||||
self.states_[name] = torch.cuda.get_rng_state()
|
||||
# And set the state to the original state we started with.
|
||||
_set_cuda_rng_state(orig_cuda_rng_state)
|
||||
|
||||
|
||||
# RNG tracker object.
|
||||
_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
|
||||
|
||||
|
||||
def get_cuda_rng_tracker():
|
||||
"""Get cuda rng tracker."""
|
||||
return _CUDA_RNG_STATE_TRACKER
|
||||
|
||||
|
||||
def model_parallel_cuda_manual_seed(seed):
|
||||
"""Initialize model parallel cuda seed.
|
||||
|
||||
This function should be called after the model parallel is
|
||||
initialized. Also, no torch.cuda.manual_seed should be called
|
||||
after this function. Basically, this is replacement for that
|
||||
function.
|
||||
Two set of RNG states are tracked:
|
||||
default state: This is for data parallelism and is the same among a
|
||||
set of model parallel GPUs but different across
|
||||
different model paralle groups. This is used for
|
||||
example for dropout in the non-model-parallel regions.
|
||||
model-parallel state: This state is different among a set of model
|
||||
parallel GPUs, but the same across data parallel
|
||||
groups. This is used for example for dropout in
|
||||
model parallel regions.
|
||||
"""
|
||||
    # 2718 is just for fun and any POSITIVE value will work.
    offset = seed + 2718
    model_parallel_seed = offset + get_model_parallel_rank()
    # Data parallel gets the original seed.
    data_parallel_seed = seed
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> initializing model parallel cuda seeds on global rank {}, '
|
||||
'model parallel rank {}, and data parallel rank {} with '
|
||||
'model parallel seed: {} and data parallel seed: {}'.format(
|
||||
torch.distributed.get_rank(), get_model_parallel_rank(),
|
||||
get_data_parallel_rank(), model_parallel_seed,
|
||||
data_parallel_seed), flush=True)
|
||||
_CUDA_RNG_STATE_TRACKER.reset()
|
||||
# Set the default state.
|
||||
torch.cuda.manual_seed(data_parallel_seed)
|
||||
# and model parallel state.
|
||||
_CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
|
||||
model_parallel_seed)
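
# Illustrative sketch (assumption): how the two seeds derived above relate for a
# 4-way model parallel group seeded with 1234. The helper name is illustrative only.
def _seed_layout_example(seed=1234, model_parallel_world_size=4):
    data_parallel_seed = seed  # identical on every rank
    model_parallel_seeds = [seed + 2718 + rank for rank in range(model_parallel_world_size)]
    return data_parallel_seed, model_parallel_seeds  # (1234, [3952, 3953, 3954, 3955])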
|
||||
|
||||
|
||||
def get_partition_start(item):
    global mp_rank, mp_size, mp_group
    partition_size = get_partition_size(item)
    start = partition_size * mp_rank
    return int(start)


def get_partition_size(item):
    global mp_rank, mp_size, mp_group
    size = item.numel()
    partition_size = size / mp_size
    return int(partition_size)
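
# Illustrative sketch (assumption): for a flattened activation of 12 elements and a
# model parallel size of 4, each rank owns a contiguous slice of 3 elements. The
# helper name and numbers are illustrative only.
def _partition_layout_example(numel=12, world_size=4):
    partition_size = numel // world_size                             # 3
    starts = [partition_size * rank for rank in range(world_size)]   # [0, 3, 6, 9]
    return partition_size, starts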
|
||||
|
||||
def get_full_inputs(tensors):
    inputs = []
    for i in range(int(len(tensors) / 2) - 1):
        item = tensors[2 * i]
        size = tensors[2 * i + 1]
        partition_size = item.numel()
        tensor_size = partition_size * mp_size
        flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=item.device)
        partitions = []
        for rank in range(mp_size):
            part_i = flat_tensor.narrow(0, partition_size * rank, partition_size)
            if rank == mp_rank:
                part_i.copy_(item)
            partitions.append(part_i)
        # Gather the partitioned activation from all model parallel ranks and
        # restore the original (unpartitioned) shape.
        dist.all_gather(partitions, partitions[mp_rank], group=mp_group)
        input_tensor = flat_tensor.view(list(size.numpy()))
        item.data = input_tensor.data

        inputs.append(item)
    inputs.append(tensors[-2])

    return tuple(inputs)
|
||||
|
||||
|
||||
|
||||
class CheckpointFunction(torch.autograd.Function):
|
||||
"""This function is adapted from torch.utils.checkpoint with
|
||||
two main changes:
|
||||
1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
|
||||
2) the states in the model parallel tracker are also properly
|
||||
tracked/set/reset.
|
||||
"""
|
||||
@staticmethod
|
||||
def forward(ctx, run_function, *args):
|
||||
ctx.run_function = run_function
|
||||
global mp_rank, mp_size, mp_group
|
||||
if mp_rank is None:
|
||||
mp_rank = get_model_parallel_rank()
|
||||
mp_size = get_model_parallel_world_size()
|
||||
mp_group = get_model_parallel_group()
|
||||
|
||||
|
||||
global cuda_device, transport_stream, PARTITION_ACTIVATIONS
|
||||
if cuda_device is None:
|
||||
if dist.get_rank() == 0:
|
||||
print(f"Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}")
|
||||
|
||||
cuda_device = torch.cuda.current_device()
|
||||
#The transport stream is used to overlap the allgather communication for the activations
|
||||
#with the computation in the backward pass
|
||||
transport_stream = torch.cuda.Stream(device=cuda_device)
|
||||
|
||||
if PARTITION_ACTIVATIONS:
|
||||
#inputs = [item.detach().contiguous().view(-1).narrow(0, get_partition_start(item), get_partition_size(item)).clone() for item in args[:-1]]
|
||||
#inputs.append(args[-1])
|
||||
print_rank_0("args: ", args)
|
||||
inputs = [item.detach().contiguous().view(-1).narrow(0, get_partition_start(item), get_partition_size(item)).clone() for item in args[:2]]
|
||||
inputs.extend(args[2:])
|
||||
|
||||
|
||||
#just in case something funky is happening such as reuse of inputs
|
||||
inputs_cuda = [item.to(cuda_device) for item in args]
|
||||
|
||||
# Copy the rng states.
|
||||
ctx.fwd_cpu_rng_state = torch.get_rng_state()
|
||||
ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
|
||||
ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
|
||||
|
||||
#ctx.save_for_backward(*args)
|
||||
with torch.no_grad():
|
||||
outputs = run_function(*inputs_cuda)
|
||||
|
||||
del inputs_cuda
|
||||
|
||||
if PARTITION_ACTIVATIONS:
|
||||
new_args = []
|
||||
for arg, inp in zip(args,inputs):
|
||||
size= torch.tensor(arg.size())
|
||||
arg.data = inp.data
|
||||
new_args.append(arg)
|
||||
new_args.append(size)
|
||||
ctx.save_for_backward(*new_args)
|
||||
else:
|
||||
ctx.save_for_backward(*args)
|
||||
|
||||
return outputs
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, *args):
|
||||
if not torch.autograd._is_checkpoint_valid():
|
||||
raise RuntimeError("Checkpointing is not compatible with .grad(), "
|
||||
"please use .backward() if possible")
|
||||
|
||||
global cuda_device, transport_stream, PARTITION_ACTIVATIONS
|
||||
|
||||
if PARTITION_ACTIVATIONS:
|
||||
with torch.cuda.stream(transport_stream):
|
||||
inputs = get_full_inputs(ctx.saved_tensors)
|
||||
detached_inputs = detach_variable(inputs)
|
||||
else:
|
||||
inputs = ctx.saved_tensors
|
||||
detached_inputs = detach_variable(inputs)
|
||||
|
||||
# Store the current states.
|
||||
bwd_cpu_rng_state = torch.get_rng_state()
|
||||
bwd_cuda_rng_state = torch.cuda.get_rng_state()
|
||||
bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
|
||||
|
||||
# Set the states to what it used to be before the forward pass.
|
||||
torch.set_rng_state(ctx.fwd_cpu_rng_state)
|
||||
_set_cuda_rng_state(ctx.fwd_cuda_rng_state)
|
||||
get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
|
||||
|
||||
if PARTITION_ACTIVATIONS:
|
||||
current_stream=torch.cuda.current_stream()
|
||||
current_stream.wait_stream(transport_stream)
|
||||
|
||||
with torch.enable_grad():
|
||||
outputs = ctx.run_function(*detached_inputs)
|
||||
|
||||
# Set the states back to what it was at the start of this function.
|
||||
torch.set_rng_state(bwd_cpu_rng_state)
|
||||
_set_cuda_rng_state(bwd_cuda_rng_state)
|
||||
get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
|
||||
|
||||
if isinstance(outputs, torch.Tensor):
|
||||
outputs = (outputs,)
|
||||
torch.autograd.backward(outputs, args)
|
||||
return (None,) + tuple(inp.grad for inp in detached_inputs)
|
||||
|
||||
|
||||
def checkpoint(function, *args):
|
||||
"""Checkpoint a model or part of the model.
|
||||
This has been directly copied from torch.utils.checkpoint."""
|
||||
return CheckpointFunction.apply(function, *args)
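
# Illustrative sketch (assumption): wrapping a custom forward in checkpoint() so the
# intermediate activations are recomputed in the backward pass instead of stored.
# Assumes activation partitioning is disabled; the helper and argument names are
# illustrative only.
def _checkpoint_usage_example(hidden_states, layer):
    def custom_forward(*inputs):
        return layer(inputs[0])
    return checkpoint(custom_forward, hidden_states)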
|
||||
|
||||
def partition_activations_in_checkpoint(partition_activation):
|
||||
global PARTITION_ACTIVATIONS
|
||||
PARTITION_ACTIVATIONS=partition_activation
|
||||
if dist.get_rank() == 0:
|
||||
print(f"**************Partition Activations {PARTITION_ACTIVATIONS}************")
|
||||
|
||||
|
||||
0
modelscope/utils/nlp/mpu/tests/__init__.py
Executable file
82
modelscope/utils/nlp/mpu/tests/commons.py
Executable file
@@ -0,0 +1,82 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
import mpu
|
||||
|
||||
|
||||
class IdentityLayer(torch.nn.Module):
|
||||
def __init__(self, size, scale=1.0):
|
||||
super(IdentityLayer, self).__init__()
|
||||
self.weight = torch.nn.Parameter(scale * torch.randn(size))
|
||||
def forward(self):
|
||||
return self.weight
|
||||
|
||||
|
||||
def set_random_seed(seed):
|
||||
"""Set random seed for reproducability."""
|
||||
random.seed(seed)
|
||||
numpy.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
mpu.model_parallel_cuda_manual_seed(seed)
|
||||
|
||||
|
||||
def initialize_distributed(backend='nccl'):
|
||||
"""Initialize torch.distributed."""
|
||||
# Get local rank in case it is provided.
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--local_rank', type=int, default=None,
|
||||
help='local rank passed from distributed launcher')
|
||||
args = parser.parse_args()
|
||||
local_rank = args.local_rank
|
||||
|
||||
# Get rank and world size.
|
||||
rank = int(os.getenv('RANK', '0'))
|
||||
world_size = int(os.getenv("WORLD_SIZE", '1'))
|
||||
|
||||
print('> initializing torch.distributed with local rank: {}, '
|
||||
'rank: {}, world size: {}'.format(local_rank, rank, world_size))
|
||||
|
||||
# Set the device id.
|
||||
device = rank % torch.cuda.device_count()
|
||||
if local_rank is not None:
|
||||
device = local_rank
|
||||
#torch.cuda.set_device(device)
|
||||
|
||||
# Call the init process.
|
||||
init_method = 'tcp://'
|
||||
master_ip = os.getenv('MASTER_ADDR', 'localhost')
|
||||
master_port = os.getenv('MASTER_PORT', '6000')
|
||||
init_method += master_ip + ':' + master_port
|
||||
torch.distributed.init_process_group(
|
||||
backend=backend,
|
||||
world_size=world_size,
|
||||
rank=rank,
|
||||
init_method=init_method)
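
# Illustrative sketch (assumption): driving initialize_distributed() as a single
# process. The helper reads RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT from the
# environment; the values and the 'gloo' backend below are placeholder assumptions
# for a CPU-only run, not project defaults.
def _single_process_init_example():
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '6000')
    os.environ.setdefault('RANK', '0')
    os.environ.setdefault('WORLD_SIZE', '1')
    initialize_distributed(backend='gloo')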
|
||||
|
||||
|
||||
def print_separator(message):
|
||||
torch.distributed.barrier()
|
||||
filler_len = (78 - len(message)) // 2
|
||||
filler = '-' * filler_len
|
||||
string = '\n' + filler + ' {} '.format(message) + filler
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(string, flush=True)
|
||||
torch.distributed.barrier()
|
||||
110
modelscope/utils/nlp/mpu/tests/test_cross_entropy.py
Executable file
@@ -0,0 +1,110 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import sys
|
||||
sys.path.append("../..")
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import mpu
|
||||
from mpu.cross_entropy import vocab_parallel_cross_entropy
|
||||
|
||||
from commons import initialize_distributed
|
||||
from commons import print_separator
|
||||
from commons import IdentityLayer
|
||||
from commons import set_random_seed
|
||||
|
||||
|
||||
def torch_cross_entropy(batch_size, seq_length, vocab_size,
|
||||
logits_scale, seed):
|
||||
set_random_seed(seed)
|
||||
identity = IdentityLayer((batch_size, seq_length, vocab_size),
|
||||
scale=logits_scale).cuda()
|
||||
logits = identity()
|
||||
target = torch.cuda.LongTensor(
|
||||
size=(batch_size, seq_length)).random_(0, vocab_size)
|
||||
loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
|
||||
target.view(-1),
|
||||
reduction='none').view_as(target).mean()
|
||||
loss.backward()
|
||||
return loss, identity.weight.grad
|
||||
|
||||
|
||||
def mpu_cross_entropy(batch_size, seq_length, vocab_size,
|
||||
logits_scale, seed):
|
||||
set_random_seed(seed)
|
||||
identity = IdentityLayer((batch_size, seq_length, vocab_size),
|
||||
scale=logits_scale).cuda()
|
||||
logits = identity()
|
||||
logits_parallel = mpu.scatter_to_model_parallel_region(logits)
|
||||
target = torch.cuda.LongTensor(
|
||||
size=(batch_size, seq_length)).random_(0, vocab_size)
|
||||
loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
|
||||
loss.backward()
|
||||
return loss, identity.weight.grad
|
||||
|
||||
|
||||
def test_cross_entropy(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing cross entropy with model parallel size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
batch_size = 13
|
||||
seq_length = 17
|
||||
vocab_size_per_partition = 11
|
||||
logits_scale = 1000.0
|
||||
vocab_size = vocab_size_per_partition * model_parallel_size
|
||||
seed = 1234
|
||||
|
||||
loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
|
||||
vocab_size, logits_scale,
|
||||
seed)
|
||||
loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
|
||||
vocab_size, logits_scale,
|
||||
seed)
|
||||
|
||||
error = loss_torch.sub_(loss_mpu).abs().max()
|
||||
print(' max error in loss on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
error = grad_torch.sub_(grad_mpu).abs().max()
|
||||
print(' max error in grad on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test cross entropy')
|
||||
test_cross_entropy(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
92
modelscope/utils/nlp/mpu/tests/test_data.py
Executable file
@@ -0,0 +1,92 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
import operator
|
||||
import sys
|
||||
sys.path.append("../..")
|
||||
|
||||
import torch
|
||||
import mpu
|
||||
from mpu import data as data_utils
|
||||
|
||||
from commons import initialize_distributed
|
||||
from commons import print_separator
|
||||
|
||||
|
||||
def test_boradcast_data(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
        print('> testing broadcast_data with model parallel size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
torch.manual_seed(1234 + mpu.get_data_parallel_rank())
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
key_size_t = {'key1': [7, 11],
|
||||
'key2': [8, 2, 1],
|
||||
'key3': [13],
|
||||
'key4': [5, 1, 2],
|
||||
'key5': [5, 12]}
|
||||
keys = list(key_size_t.keys())
|
||||
|
||||
data = {}
|
||||
data_t = {}
|
||||
for key in key_size_t:
|
||||
data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
|
||||
data_t[key] = data[key].clone()
|
||||
data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
|
||||
data_t['keyX'] = data['keyX'].clone()
|
||||
if mpu.get_model_parallel_rank() != 0:
|
||||
data = None
|
||||
|
||||
data_utils._check_data_types(keys, data_t, torch.int64)
|
||||
key_size, key_numel, \
|
||||
total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
|
||||
for key in keys:
|
||||
assert key_size[key] == key_size_t[key]
|
||||
total_numel_t = 0
|
||||
for key in keys:
|
||||
target_size = functools.reduce(operator.mul, key_size_t[key], 1)
|
||||
assert key_numel[key] == target_size
|
||||
total_numel_t += target_size
|
||||
assert total_numel == total_numel_t
|
||||
|
||||
data_b = data_utils.broadcast_data(keys, data, torch.int64)
|
||||
for key in keys:
|
||||
tensor = data_t[key].cuda()
|
||||
assert data_b[key].sub(tensor).abs().max() == 0
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
        print_separator('test broadcast data')
|
||||
test_boradcast_data(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
|
||||
98
modelscope/utils/nlp/mpu/tests/test_initialize.py
Executable file
@@ -0,0 +1,98 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
sys.path.append("../..")
|
||||
|
||||
import torch
|
||||
import mpu
|
||||
|
||||
from commons import initialize_distributed
|
||||
from commons import print_separator
|
||||
|
||||
|
||||
def test_initialize_model_parallel(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing initialize_model_parallel with size {} ...'.format(
|
||||
model_parallel_size))
|
||||
model_parallel_size_ = min(model_parallel_size,
|
||||
torch.distributed.get_world_size())
|
||||
assert not mpu.model_parallel_is_initialized()
|
||||
mpu.initialize_model_parallel(model_parallel_size_)
|
||||
assert mpu.model_parallel_is_initialized()
|
||||
|
||||
# Checks.
|
||||
def check(group, world_size, rank):
|
||||
assert world_size == torch.distributed.get_world_size(group=group)
|
||||
assert rank == torch.distributed.get_rank(group=group)
|
||||
|
||||
# Model parallel.
|
||||
world_size = model_parallel_size_
|
||||
rank = torch.distributed.get_rank() % model_parallel_size_
|
||||
assert world_size == mpu.get_model_parallel_world_size()
|
||||
assert rank == mpu.get_model_parallel_rank()
|
||||
check(mpu.get_model_parallel_group(), world_size, rank)
|
||||
|
||||
|
||||
# Data parallel.
|
||||
world_size = torch.distributed.get_world_size() // model_parallel_size_
|
||||
rank = torch.distributed.get_rank() // model_parallel_size
|
||||
assert world_size == mpu.get_data_parallel_world_size()
|
||||
assert rank == mpu.get_data_parallel_rank()
|
||||
check(mpu.get_data_parallel_group(), world_size, rank)
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_get_model_parallel_src_rank(model_parallel_size_):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing get_model_parallel_src_rank with size {} ...'.format(
|
||||
model_parallel_size_))
|
||||
model_parallel_size = min(model_parallel_size_,
|
||||
torch.distributed.get_world_size())
|
||||
assert not mpu.model_parallel_is_initialized()
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
assert mpu.model_parallel_is_initialized()
|
||||
|
||||
# Checks
|
||||
src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank()
|
||||
assert mpu.get_model_parallel_src_rank() == src_rank
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test initialize model parallel')
|
||||
test_initialize_model_parallel(model_parallel_size)
|
||||
print_separator('test model parallel source rank')
|
||||
test_get_model_parallel_src_rank(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
529
modelscope/utils/nlp/mpu/tests/test_layers.py
Executable file
@@ -0,0 +1,529 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import sys
|
||||
sys.path.append("../..")
|
||||
|
||||
import torch
|
||||
import torch.nn.init as init
|
||||
from torch.nn.parameter import Parameter
|
||||
import mpu
|
||||
|
||||
from commons import initialize_distributed
|
||||
from commons import print_separator
|
||||
from commons import set_random_seed
|
||||
from mpu import layers
|
||||
|
||||
|
||||
def test_parallel_embedding(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing parallel embedding with model parallel size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
batch_size = 17
|
||||
seq_length = 23
|
||||
vocab_size = 48
|
||||
hidden_size = 16
|
||||
seed = 1236
|
||||
|
||||
set_random_seed(123)
|
||||
input_data = torch.LongTensor(
|
||||
size=(batch_size,seq_length)).random_(0, vocab_size).cuda()
|
||||
loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
|
||||
|
||||
output = embedding_original(input_data)
|
||||
loss_original = torch.mul(output, loss_weight).sum()
|
||||
loss_original.backward()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_parallel = layers.ParallelEmbedding(
|
||||
vocab_size, hidden_size, init_method=init.normal_).cuda()
|
||||
output = embedding_parallel(input_data)
|
||||
loss_parallel = torch.mul(output, loss_weight).sum()
|
||||
loss_parallel.backward()
|
||||
|
||||
set_random_seed(seed)
|
||||
embedding_vocab_parallel = layers.VocabParallelEmbedding(
|
||||
vocab_size, hidden_size, init_method=init.normal_).cuda()
|
||||
output = embedding_vocab_parallel(input_data)
|
||||
loss_vocab_parallel = torch.mul(output, loss_weight).sum()
|
||||
loss_vocab_parallel.backward()
|
||||
|
||||
torch.distributed.barrier()
|
||||
error = loss_parallel.sub(loss_original).abs()
|
||||
print(' error in loss (parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
torch.distributed.barrier()
|
||||
error = loss_vocab_parallel.sub(loss_original).abs()
|
||||
print(' error in loss (vocab parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
weight_grad_orig = torch.split(embedding_original.weight.grad,
|
||||
hidden_size // model_parallel_size,
|
||||
1)[mpu.get_model_parallel_rank()]
|
||||
error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
|
||||
print(' error in grad (parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
weight_grad_orig = torch.split(embedding_original.weight.grad,
|
||||
vocab_size // model_parallel_size,
|
||||
0)[mpu.get_model_parallel_rank()]
|
||||
error = embedding_vocab_parallel.weight.grad.sub(
|
||||
weight_grad_orig).abs().max()
|
||||
print(' error in grad (vocab parallel) on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-12, 'error: {}'.format(error)
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_initialize_affine_weight(model_parallel_size):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing initialize_affine_weight with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
input_size_coeff = 13
|
||||
input_size = input_size_coeff * model_parallel_size
|
||||
output_size_coeff = 17
|
||||
output_size = output_size_coeff * model_parallel_size
|
||||
|
||||
# ---------------
|
||||
# Column parallel
|
||||
# ---------------
|
||||
weight = torch.empty(output_size_coeff, input_size)
|
||||
set_random_seed(seed)
|
||||
layers._initialize_affine_weight(weight, output_size, input_size,
|
||||
|
||||
output_size_coeff, 0,
|
||||
torch.nn.init.normal_)
|
||||
# Target.
|
||||
set_random_seed(seed)
|
||||
master_weight = torch.empty(output_size, input_size)
|
||||
torch.nn.init.normal_(master_weight)
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_weight = torch.split(master_weight, output_size_coeff,
|
||||
dim=0)[rank].contiguous().clone()
|
||||
|
||||
# Compare.
|
||||
error = weight.sub(my_weight).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' column parallel max error (should be zero) on global rank '
|
||||
'{}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# ------------
|
||||
# Row parallel
|
||||
# ------------
|
||||
weight = torch.empty(output_size, input_size_coeff)
|
||||
set_random_seed(seed)
|
||||
mpu.layers._initialize_affine_weight(weight, output_size, input_size,
|
||||
input_size_coeff, 1,
|
||||
torch.nn.init.normal_)
|
||||
# Target.
|
||||
set_random_seed(seed)
|
||||
master_weight = torch.empty(output_size, input_size)
|
||||
torch.nn.init.normal_(master_weight)
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_weight = torch.split(master_weight, input_size_coeff,
|
||||
dim=1)[rank].contiguous().clone()
|
||||
|
||||
# Compare.
|
||||
error = weight.sub(my_weight).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' row parallel max error (should be zero) on global rank '
|
||||
'{}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
class IdentityLayer2D(torch.nn.Module):
|
||||
def __init__(self, m , n):
|
||||
super(IdentityLayer2D, self).__init__()
|
||||
self.weight = Parameter(torch.Tensor(m, n))
|
||||
torch.nn.init.xavier_normal_(self.weight)
|
||||
def forward(self):
|
||||
return self.weight
|
||||
|
||||
|
||||
def test_column_parallel_linear(model_parallel_size):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing ColumnParallelLinear with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
set_random_seed(seed)
|
||||
input_size_coeff = 13
|
||||
input_size = input_size_coeff * model_parallel_size
|
||||
output_size_coeff = 17
|
||||
output_size = output_size_coeff * model_parallel_size
|
||||
batch_size = 7
|
||||
|
||||
# Network
|
||||
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
|
||||
linear_layer = mpu.ColumnParallelLinear(
|
||||
input_size, output_size, keep_master_weight_for_test=True).cuda()
|
||||
loss_weight = torch.randn([batch_size, output_size]).cuda()
|
||||
# Forward
|
||||
input_ = identity_layer()
|
||||
output = linear_layer(input_)
|
||||
loss = torch.mul(output, loss_weight).sum()
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
# Values.
|
||||
dLdY = loss_weight
|
||||
X = identity_layer.weight
|
||||
A = linear_layer.master_weight.cuda()
|
||||
dLdA = torch.matmul(dLdY.t(), X)
|
||||
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
|
||||
dLdX = torch.matmul(dLdY, A)
|
||||
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_dLdA = torch.split(dLdA, output_size_coeff,
|
||||
dim=0)[rank].contiguous().clone()
|
||||
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdA on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
my_dLdb = torch.split(dLdb, output_size_coeff,
|
||||
dim=0)[rank].contiguous().clone()
|
||||
error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdb on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
error = dLdX.sub(identity_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdX on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
def test_row_parallel_linear(model_parallel_size):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing RowParallelLinear with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
set_random_seed(seed)
|
||||
input_size_coeff = 13
|
||||
input_size = input_size_coeff * model_parallel_size
|
||||
output_size_coeff = 17
|
||||
output_size = output_size_coeff * model_parallel_size
|
||||
batch_size = 7
|
||||
|
||||
# Network
|
||||
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
|
||||
linear_layer = mpu.RowParallelLinear(
|
||||
input_size, output_size, keep_master_weight_for_test=True).cuda()
|
||||
loss_weight = torch.randn([batch_size, output_size]).cuda()
|
||||
# Forward
|
||||
input_ = identity_layer()
|
||||
output = linear_layer(input_)
|
||||
loss = torch.mul(output, loss_weight).sum()
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
# Values.
|
||||
dLdY = loss_weight
|
||||
X = identity_layer.weight
|
||||
A = linear_layer.master_weight.cuda()
|
||||
dLdA = torch.matmul(dLdY.t(), X)
|
||||
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
|
||||
dLdX = torch.matmul(dLdY, A)
|
||||
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
my_dLdA = torch.split(dLdA, input_size_coeff,
|
||||
dim=1)[rank].contiguous().clone()
|
||||
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdA on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
error = dLdb.sub(linear_layer.bias.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdb on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
error = dLdX.sub(identity_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' error in dLdX on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
class IdentityLayer3D(torch.nn.Module):
|
||||
def __init__(self, m , n, k):
|
||||
super(IdentityLayer3D, self).__init__()
|
||||
self.weight = Parameter(torch.Tensor(m, n, k))
|
||||
torch.nn.init.xavier_normal_(self.weight)
|
||||
def forward(self):
|
||||
return self.weight
|
||||
|
||||
|
||||
def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, dropout_prob, batch_size,
|
||||
sequence_length):
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
set_random_seed(seed)
|
||||
|
||||
num_att_heads = num_att_heads_per_partition * \
|
||||
torch.distributed.get_world_size()
|
||||
hidden_size = hidden_size_per_att_head * num_att_heads
|
||||
|
||||
# Network
|
||||
identity_layer = IdentityLayer3D(batch_size, sequence_length,
|
||||
hidden_size).cuda()
|
||||
attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
|
||||
dropout_prob).cuda()
|
||||
loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
|
||||
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
|
||||
# Forward
|
||||
input_ = identity_layer()
|
||||
output = attention_layer(input_, attention_mask)
|
||||
loss = torch.mul(output, loss_weight).sum()
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
mpu.destroy_model_parallel()
|
||||
return rank, hidden_size, model_parallel_size, loss, \
|
||||
attention_layer, identity_layer
|
||||
|
||||
|
||||
def test_parallel_self_attention(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing ParallelSelfAttention with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
|
||||
num_att_heads_per_partition = 3
|
||||
hidden_size_per_att_head = 7
|
||||
dropout_prob = 0.0 # has to be zero
|
||||
batch_size = 5
|
||||
sequence_length = 13
|
||||
|
||||
    rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
        attention_layer_1, identity_layer_1 = parallel_self_attention(
|
||||
1, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
|
||||
|
||||
rank, hidden_size, model_parallel_size, loss, \
|
||||
attention_layer, identity_layer =parallel_self_attention(
|
||||
model_parallel_size, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
|
||||
    assert hidden_size_1 == hidden_size
|
||||
|
||||
error = loss_1.sub(loss).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' loss error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-6
|
||||
|
||||
my_lin_grad_list = torch.split(
|
||||
attention_layer_1.query_key_value.weight.grad,
|
||||
hidden_size // model_parallel_size, 0)[rank::model_parallel_size]
|
||||
my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
|
||||
error = my_lin_grad.sub(
|
||||
attention_layer.query_key_value.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' weight gradient error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-6
|
||||
|
||||
error = identity_layer_1.weight.grad.sub(
|
||||
identity_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' input gradient error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-6
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
def parallel_transformer(model_parallel_size, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, batch_size, sequence_length):
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed = 12345
|
||||
set_random_seed(seed)
|
||||
|
||||
num_att_heads = num_att_heads_per_partition * \
|
||||
torch.distributed.get_world_size()
|
||||
hidden_size = hidden_size_per_att_head * num_att_heads
|
||||
intermediate_size = 4 * hidden_size
|
||||
|
||||
# Network
|
||||
identity_layer = IdentityLayer3D(batch_size, sequence_length,
|
||||
hidden_size).cuda()
|
||||
transformer_layer = mpu.BertParallelTransformerLayer(
|
||||
hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
|
||||
torch.nn.functional.relu, 1.0e-5).cuda()
|
||||
|
||||
loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
|
||||
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
|
||||
# Forward
|
||||
input_ = identity_layer()
|
||||
output = transformer_layer(input_, attention_mask)
|
||||
loss = torch.mul(output, loss_weight).sum()
|
||||
# Backward
|
||||
loss.backward()
|
||||
|
||||
rank = mpu.get_model_parallel_rank()
|
||||
mpu.destroy_model_parallel()
|
||||
return rank, hidden_size, model_parallel_size, loss, \
|
||||
transformer_layer, identity_layer
|
||||
|
||||
|
||||
def test_parallel_transformer_layer(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing ParallelTransformerLayer with model parallel '
|
||||
'size: {}'.format(model_parallel_size))
|
||||
|
||||
num_att_heads_per_partition = 3
|
||||
hidden_size_per_att_head = 7
|
||||
batch_size = 5
|
||||
sequence_length = 13
|
||||
|
||||
rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
|
||||
transformer_layer_1, identity_layer_1 = parallel_transformer(
|
||||
1, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, batch_size, sequence_length)
|
||||
|
||||
rank, hidden_size, model_parallel_size, loss, \
|
||||
transformer_layer, identity_layer = parallel_transformer(
|
||||
model_parallel_size, num_att_heads_per_partition,
|
||||
hidden_size_per_att_head, batch_size, sequence_length)
|
||||
|
||||
error = loss_1.sub(loss).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' loss error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-5, 'error: {}'.format(error)
|
||||
|
||||
error = identity_layer_1.weight.grad.sub(
|
||||
identity_layer.weight.grad).abs().max()
|
||||
torch.distributed.barrier()
|
||||
print(' input gradient error on global rank {}: {}'.format(
|
||||
torch.distributed.get_rank(), error))
|
||||
assert error < 5.0e-5, 'error: {}'.format(error)
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(' >> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
|
||||
print_separator('test initialize affine weight')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_initialize_affine_weight(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test parallel embedding')
|
||||
test_parallel_embedding(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
print_separator('test column-parallel linear')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_column_parallel_linear(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
print_separator('test row-parallel linear')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_row_parallel_linear(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
print_separator('test parallel self-attention')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_parallel_self_attention(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
print_separator('test parallel transformer')
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
test_parallel_transformer_layer(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
207
modelscope/utils/nlp/mpu/tests/test_random.py
Executable file
@@ -0,0 +1,207 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
sys.path.append("../..")
|
||||
|
||||
import torch
|
||||
import mpu
|
||||
|
||||
from commons import initialize_distributed
|
||||
from commons import print_separator
|
||||
|
||||
|
||||
def test_set_cuda_rng_state(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing set_rng_state with size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
size = 123
|
||||
seed = 1234
|
||||
torch.cuda.manual_seed(1234)
|
||||
tensor = torch.cuda.FloatTensor(size)
|
||||
|
||||
# Get the state
|
||||
rng_state = torch.cuda.get_rng_state()
|
||||
rng_state_copy = rng_state.clone()
|
||||
|
||||
# Do some stuff.
|
||||
for _ in range(5):
|
||||
torch.randn(size, out=tensor)
|
||||
result_1 = tensor.clone()
|
||||
|
||||
assert rng_state.sub(rng_state_copy).max() == 0
|
||||
assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0
|
||||
|
||||
# State should be different.
|
||||
new_rng_state = torch.cuda.get_rng_state()
|
||||
max_diff = new_rng_state.sub(rng_state).max()
|
||||
print(' max diff in rng state (should be non-zero) on global rank {}: {}'.
|
||||
format(torch.distributed.get_rank(), max_diff))
|
||||
assert max_diff > 0
|
||||
|
||||
# Reset the rng state and do the same stuff.
|
||||
mpu.random._set_cuda_rng_state(rng_state)
|
||||
for _ in range(5):
|
||||
torch.randn(size, out=tensor)
|
||||
mpu.random._set_cuda_rng_state(rng_state)
|
||||
for _ in range(5):
|
||||
torch.randn(size, out=tensor)
|
||||
result_2 = tensor.clone()
|
||||
|
||||
# Results should be the same
|
||||
error = result_2.sub(result_1).abs().max()
|
||||
print(' max error in generated tensors (should be zero) on '
|
||||
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Input state should have remained intact.
|
||||
error = rng_state.sub(rng_state_copy).max()
|
||||
print(' max error in rng state (should be zero) on global rank {}: {}'.
|
||||
format(torch.distributed.get_rank(), error))
|
||||
assert error == 0
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_cuda_rng_tracker(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing cuda rng tracker with size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
seed_1 = 1234
|
||||
seed_2 = 4321
|
||||
size = [12, 21]
|
||||
tensor = torch.cuda.FloatTensor(size)
|
||||
|
||||
# Set to seed_1 and generate two tensors.
|
||||
torch.cuda.manual_seed(seed_1)
|
||||
torch.randn(size, out=tensor)
|
||||
target_11 = tensor.clone()
|
||||
torch.randn(size, out=tensor)
|
||||
target_12 = tensor.clone()
|
||||
|
||||
# Set to seed_2 and generate two tensors.
|
||||
torch.cuda.manual_seed(seed_2)
|
||||
torch.randn(size, out=tensor)
|
||||
target_21 = tensor.clone()
|
||||
torch.randn(size, out=tensor)
|
||||
target_22 = tensor.clone()
|
||||
|
||||
# Now if we interleave seed_1 and seed_2,
|
||||
# we should still get the same tensors
|
||||
torch.cuda.manual_seed(seed_1)
|
||||
mpu.get_cuda_rng_tracker().add('test', seed_2)
|
||||
|
||||
torch.randn(size, out=tensor)
|
||||
result_11 = tensor.clone()
|
||||
|
||||
with mpu.get_cuda_rng_tracker().fork('test'):
|
||||
torch.randn(size, out=tensor)
|
||||
result_21 = tensor.clone()
|
||||
|
||||
torch.randn(size, out=tensor)
|
||||
result_12 = tensor.clone()
|
||||
|
||||
with mpu.get_cuda_rng_tracker().fork('test'):
|
||||
torch.randn(size, out=tensor)
|
||||
result_22 = tensor.clone()
|
||||
|
||||
diff = result_11.sub(result_21).abs().max()
|
||||
diff = min(diff, result_12.sub(result_22).abs().max())
|
||||
print(' max diff in generated tensors (should be non-zero) on '
|
||||
'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
|
||||
assert diff > 1.0e-6
|
||||
error = max(result_11.sub(target_11).abs().max(),
|
||||
result_12.sub(target_12).abs().max())
|
||||
error = max(error, result_21.sub(target_21).abs().max())
|
||||
error = max(error, result_22.sub(target_22).abs().max())
|
||||
print(' max error in generated tensors (should be zero) on '
|
||||
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
|
||||
assert error < 1.0e-6
|
||||
|
||||
# Reset the tracker
|
||||
mpu.get_cuda_rng_tracker().reset()
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
def test_model_parallel_cuda_manual_seed(model_parallel_size):
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('> testing model parallel cuda manual seed with size {} ...'.
|
||||
format(model_parallel_size))
|
||||
|
||||
mpu.initialize_model_parallel(model_parallel_size)
|
||||
model_parallel_size = mpu.get_model_parallel_world_size()
|
||||
|
||||
mpu.model_parallel_cuda_manual_seed(12345)
|
||||
assert torch.cuda.initial_seed() == 12345
|
||||
with mpu.get_cuda_rng_tracker().fork():
|
||||
assert torch.cuda.initial_seed() == (12345 + 2718 +
|
||||
mpu.get_model_parallel_rank())
|
||||
|
||||
# Reset the tracker
|
||||
mpu.get_cuda_rng_tracker().reset()
|
||||
|
||||
# Reset groups
|
||||
mpu.destroy_model_parallel()
|
||||
|
||||
torch.distributed.barrier()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print('>> passed the test :-)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
initialize_distributed()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test set rng state')
|
||||
test_set_cuda_rng_state(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test cuda rng tracker')
|
||||
test_cuda_rng_tracker(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
model_parallel_size = 1
|
||||
while model_parallel_size <= world_size:
|
||||
print_separator('test model parallel cuda manual seed')
|
||||
test_model_parallel_cuda_manual_seed(model_parallel_size)
|
||||
model_parallel_size *= 2
|
||||
|
||||
1065
modelscope/utils/nlp/mpu/transformer.py
Executable file
File diff suppressed because it is too large
70
modelscope/utils/nlp/mpu/utils.py
Executable file
@@ -0,0 +1,70 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def ensure_divisibility(numerator, denominator):
|
||||
"""Ensure that numerator is divisible by the denominator."""
|
||||
assert numerator % denominator == 0, '{} is not divisible by {}'.format(
|
||||
numerator, denominator)
|
||||
|
||||
|
||||
def divide(numerator, denominator):
|
||||
"""Ensure that numerator is divisible by the denominator and return
|
||||
the division value."""
|
||||
ensure_divisibility(numerator, denominator)
|
||||
return numerator // denominator
|
||||
|
||||
|
||||
def split_tensor_along_last_dim(tensor, num_partitions,
|
||||
contiguous_split_chunks=False):
|
||||
"""Split a tensor along its last dimension.
|
||||
Arguments:
|
||||
tensor: input tensor.
|
||||
num_partitions: number of partitions to split the tensor
|
||||
contiguous_split_chunks: If True, make each chunk contiguous
|
||||
in memory.
|
||||
"""
|
||||
# Get the size and dimension.
|
||||
last_dim = tensor.dim() - 1
|
||||
last_dim_size = divide(tensor.size()[last_dim], num_partitions)
|
||||
# Split.
|
||||
tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
|
||||
# Note: torch.split does not create contiguous tensors by default.
|
||||
if contiguous_split_chunks:
|
||||
return tuple(chunk.contiguous() for chunk in tensor_list)
|
||||
|
||||
return tensor_list
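
# Illustrative sketch (assumption): splitting a (2, 6) tensor into 3 partitions along
# the last dimension yields three (2, 2) chunks. The helper name is illustrative only.
def _split_example():
    tensor = torch.arange(12).reshape(2, 6)
    chunks = split_tensor_along_last_dim(tensor, 3)
    return [chunk.shape for chunk in chunks]  # [torch.Size([2, 2])] * 3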
|
||||
|
||||
|
||||
class VocabUtility:
|
||||
"""Split the vocabulary into `world_size` chunks amd return the
|
||||
first and last index of the vocabulary belonging to the `rank`
|
||||
partition: Note that indecies in [fist, last)"""
|
||||
|
||||
@staticmethod
|
||||
def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
|
||||
rank, world_size):
|
||||
index_f = rank * per_partition_vocab_size
|
||||
index_l = index_f + per_partition_vocab_size
|
||||
return index_f, index_l
|
||||
|
||||
@staticmethod
|
||||
def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
|
||||
per_partition_vocab_size = divide(global_vocab_size, world_size)
|
||||
return VocabUtility.vocab_range_from_per_partition_vocab_size(
|
||||
per_partition_vocab_size, rank, world_size)
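
# Illustrative sketch (assumption): a global vocabulary of 100 tokens split across
# 4 ranks gives rank 1 the half-open index range [25, 50). The helper name and the
# numbers are illustrative only.
def _vocab_range_example():
    return VocabUtility.vocab_range_from_global_vocab_size(100, rank=1, world_size=4)  # (25, 50)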
|
||||
109
modelscope/utils/nlp/utils.py
Normal file
@@ -0,0 +1,109 @@
|
||||
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
"""Utilities for logging and serialization"""
|
||||
|
||||
def get_log_constant(user_log):
|
||||
return '[user log]' if user_log else ''
|
||||
|
||||
def print_rank_0(message):
|
||||
if torch.distributed.is_initialized():
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(message, flush=True)
|
||||
else:
|
||||
print(message, flush=True)
|
||||
|
||||
|
||||
def print_args(args):
|
||||
"""Print arguments."""
|
||||
|
||||
print('arguments:', flush=True)
|
||||
for arg in vars(args):
|
||||
dots = '.' * (29 - len(arg))
|
||||
print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)
|
||||
|
||||
|
||||
def report_memory(name):
|
||||
"""Simple GPU memory report."""
|
||||
|
||||
mega_bytes = 1024.0 * 1024.0
|
||||
string = name + ' memory (MB)'
|
||||
string += ' | allocated: {}'.format(
|
||||
torch.cuda.memory_allocated() / mega_bytes)
|
||||
string += ' | max allocated: {}'.format(
|
||||
torch.cuda.max_memory_allocated() / mega_bytes)
|
||||
string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes)
|
||||
string += ' | max cached: {}'.format(
|
||||
torch.cuda.max_memory_cached()/ mega_bytes)
|
||||
print_rank_0(string)
|
||||
|
||||
|
||||
class Timers:
|
||||
"""Group of timers."""
|
||||
|
||||
class Timer:
|
||||
"""Timer."""
|
||||
|
||||
def __init__(self, name):
|
||||
self.name_ = name
|
||||
self.elapsed_ = 0.0
|
||||
self.started_ = False
|
||||
self.start_time = time.time()
|
||||
|
||||
def start(self):
|
||||
"""Start the timer."""
|
||||
assert not self.started_, 'timer has already been started'
|
||||
torch.cuda.synchronize()
|
||||
self.start_time = time.time()
|
||||
self.started_ = True
|
||||
|
||||
def stop(self):
|
||||
"""Stop the timer."""
|
||||
assert self.started_, 'timer is not started'
|
||||
torch.cuda.synchronize()
|
||||
self.elapsed_ += (time.time() - self.start_time)
|
||||
self.started_ = False
|
||||
|
||||
def reset(self):
|
||||
"""Reset timer."""
|
||||
self.elapsed_ = 0.0
|
||||
self.started_ = False
|
||||
|
||||
def elapsed(self, reset=True):
|
||||
"""Calculate the elapsed time."""
|
||||
started_ = self.started_
|
||||
            # If timing is in progress, end it first.
|
||||
if self.started_:
|
||||
self.stop()
|
||||
# Get the elapsed time.
|
||||
elapsed_ = self.elapsed_
|
||||
# Reset the elapsed time
|
||||
if reset:
|
||||
self.reset()
|
||||
# If timing was in progress, set it back.
|
||||
if started_:
|
||||
self.start()
|
||||
return elapsed_
|
||||
|
||||
def __init__(self):
|
||||
self.timers = {}
|
||||
|
||||
def __call__(self, name):
|
||||
if name not in self.timers:
|
||||
self.timers[name] = self.Timer(name)
|
||||
return self.timers[name]
|
||||
|
||||
def log(self, names, normalizer=1.0, reset=True):
|
||||
"""Log a group of timers."""
|
||||
assert normalizer > 0.0
|
||||
string = 'time (ms)'
|
||||
for name in names:
|
||||
elapsed_time = self.timers[name].elapsed(
|
||||
reset=reset) * 1000.0/ normalizer
|
||||
string += ' | {}: {:.2f}'.format(name, elapsed_time)
|
||||
print_rank_0(string)
|
||||
|
||||
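A minimal usage sketch of the Timers group (hedged: the timer name is illustrative, and CUDA must be available because Timer.start/stop call torch.cuda.synchronize()):

# Hypothetical timing of one generation step.
timers = Timers()
timers('generate').start()
# ... run the PLUG forward/generation step here ...
timers('generate').stop()
timers.log(['generate'], normalizer=1.0)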
@@ -50,13 +50,13 @@ def _init_dist_pytorch(backend: str, **kwargs) -> None:
    # rank = int(os.environ['RANK'])
    local_rank = int(os.environ['LOCAL_RANK'])

    torch.cuda.set_device(local_rank)
    #torch.cuda.set_device(local_rank)
    dist.init_process_group(backend=backend, **kwargs)


def _init_dist_mpi(backend: str, **kwargs) -> None:
    local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
    #torch.cuda.set_device(local_rank)
    if 'MASTER_PORT' not in os.environ:
        # 29500 is torch.distributed default port
        os.environ['MASTER_PORT'] = '29500'
@@ -82,7 +82,7 @@ def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None:
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    #torch.cuda.set_device(proc_id % num_gpus)
    addr = subprocess.getoutput(
        f'scontrol show hostname {node_list} | head -n1')
    # specify master port
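With the explicit torch.cuda.set_device calls commented out, device binding is left to the caller. For reference, a hedged sketch of the per-process binding these helpers previously performed under a torchrun-style launch (LOCAL_RANK is set by the launcher):

import os
import torch

local_rank = int(os.environ.get('LOCAL_RANK', 0))
torch.cuda.set_device(local_rank)  # one GPU per local process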
@@ -3,7 +3,7 @@ import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration
from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration, PlugForTextGeneration
from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import TextGenerationPipeline
from modelscope.preprocessors import TextGenerationPreprocessor
@@ -34,83 +34,15 @@ class TextGenerationTest(unittest.TestCase):
        self.gpt3_large_model_id = 'damo/nlp_gpt3_text-generation_chinese-large'
        self.gpt3_input = '《故乡》。深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地,'

    def run_pipeline_with_model_instance(self, model_id, input):
        model = Model.from_pretrained(model_id)
        preprocessor = TextGenerationPreprocessor(
            model.model_dir,
            model.tokenizer,
            first_sequence='sentence',
            second_sequence=None)
        pipeline_ins = pipeline(
            task=Tasks.text_generation, model=model, preprocessor=preprocessor)
        print(pipeline_ins(input))
        self.plug_model_id = 'damo/nlp_plug_text-generation_chinese'
        self.plug_input = '段誉轻挥折扇,摇了摇头,说'

    def run_pipeline_with_model_id(self, model_id, input):
        pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
        print(pipeline_ins(input))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_palm_zh_with_model_name(self):
        self.run_pipeline_with_model_id(self.palm_model_id_zh,
                                        self.palm_input_zh)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_palm_en_with_model_name(self):
        self.run_pipeline_with_model_id(self.palm_model_id_en,
                                        self.palm_input_en)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_gpt_base_with_model_name(self):
        self.run_pipeline_with_model_id(self.gpt3_base_model_id,
                                        self.gpt3_input)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_gpt_large_with_model_name(self):
        self.run_pipeline_with_model_id(self.gpt3_large_model_id,
                                        self.gpt3_input)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_palm_zh_with_model_instance(self):
        self.run_pipeline_with_model_instance(self.palm_model_id_zh,
                                              self.palm_input_zh)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_palm_en_with_model_instance(self):
        self.run_pipeline_with_model_instance(self.palm_model_id_en,
                                              self.palm_input_en)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_gpt_base_with_model_instance(self):
        self.run_pipeline_with_model_instance(self.gpt3_base_model_id,
                                              self.gpt3_input)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_gpt_large_with_model_instance(self):
        self.run_pipeline_with_model_instance(self.gpt3_large_model_id,
                                              self.gpt3_input)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_palm(self):
        for model_id, input in ((self.palm_model_id_zh, self.palm_input_zh),
                                (self.palm_model_id_en, self.palm_input_en)):
            cache_path = snapshot_download(model_id)
            model = PalmForTextGeneration.from_pretrained(cache_path)
            preprocessor = TextGenerationPreprocessor(
                cache_path,
                model.tokenizer,
                first_sequence='sentence',
                second_sequence=None)
            pipeline1 = TextGenerationPipeline(model, preprocessor)
            pipeline2 = pipeline(
                Tasks.text_generation, model=model, preprocessor=preprocessor)
            print(
                f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
            )

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_gpt3(self):
        cache_path = snapshot_download(self.gpt3_base_model_id)
        model = GPT3ForTextGeneration(cache_path)
    def test_plug(self):
        import torch
        print("start_method", str(torch.multiprocessing.get_start_method(allow_none=True)))
        torch.multiprocessing.set_start_method("spawn")
        cache_path = "/home/suluyan.sly/model/plug_model"
        model = PlugForTextGeneration(cache_path)
        preprocessor = TextGenerationPreprocessor(
            cache_path,
            model.tokenizer,
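The hardcoded cache_path above points at a developer-local copy of the PLUG checkpoint. A hedged sketch of the hub-based equivalent, assuming the damo/nlp_plug_text-generation_chinese id declared in setUp becomes downloadable:

# Hypothetical replacement for the local path once the checkpoint is published.
cache_path = snapshot_download(self.plug_model_id)
model = PlugForTextGeneration(cache_path)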
@@ -120,13 +52,102 @@ class TextGenerationTest(unittest.TestCase):
        pipeline2 = pipeline(
            Tasks.text_generation, model=model, preprocessor=preprocessor)
        print(
            f'pipeline1: {pipeline1(self.gpt3_input)}\npipeline2: {pipeline2(self.gpt3_input)}'
            f'pipeline1: {pipeline1(self.plug_input)}\npipeline2: {pipeline2(self.plug_input)}'
        )

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.text_generation)
        print(pipeline_ins(self.palm_input_zh))
    # def run_pipeline_with_model_instance(self, model_id, input):
    #     model = Model.from_pretrained(model_id)
    #     preprocessor = TextGenerationPreprocessor(
    #         model.model_dir,
    #         model.tokenizer,
    #         first_sequence='sentence',
    #         second_sequence=None)
    #     pipeline_ins = pipeline(
    #         task=Tasks.text_generation, model=model, preprocessor=preprocessor)
    #     print(pipeline_ins(input))

    # def run_pipeline_with_model_id(self, model_id, input):
    #     pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
    #     print(pipeline_ins(input))

    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    # def test_palm_zh_with_model_name(self):
    #     self.run_pipeline_with_model_id(self.palm_model_id_zh,
    #                                     self.palm_input_zh)

    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    # def test_palm_en_with_model_name(self):
    #     self.run_pipeline_with_model_id(self.palm_model_id_en,
    #                                     self.palm_input_en)

    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    # def test_gpt_base_with_model_name(self):
    #     self.run_pipeline_with_model_id(self.gpt3_base_model_id,
    #                                     self.gpt3_input)

    # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    # def test_gpt_large_with_model_name(self):
    #     self.run_pipeline_with_model_id(self.gpt3_large_model_id,
    #                                     self.gpt3_input)

    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    # def test_palm_zh_with_model_instance(self):
    #     self.run_pipeline_with_model_instance(self.palm_model_id_zh,
    #                                           self.palm_input_zh)

    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    # def test_palm_en_with_model_instance(self):
    #     self.run_pipeline_with_model_instance(self.palm_model_id_en,
    #                                           self.palm_input_en)

    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    # def test_gpt_base_with_model_instance(self):
    #     self.run_pipeline_with_model_instance(self.gpt3_base_model_id,
    #                                           self.gpt3_input)

    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    # def test_gpt_large_with_model_instance(self):
    #     self.run_pipeline_with_model_instance(self.gpt3_large_model_id,
    #                                           self.gpt3_input)

    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    # def test_run_palm(self):
    #     for model_id, input in ((self.palm_model_id_zh, self.palm_input_zh),
    #                             (self.palm_model_id_en, self.palm_input_en)):
    #         cache_path = snapshot_download(model_id)
    #         model = PalmForTextGeneration.from_pretrained(cache_path)
    #         preprocessor = TextGenerationPreprocessor(
    #             cache_path,
    #             model.tokenizer,
    #             first_sequence='sentence',
    #             second_sequence=None)
    #         pipeline1 = TextGenerationPipeline(model, preprocessor)
    #         pipeline2 = pipeline(
    #             Tasks.text_generation, model=model, preprocessor=preprocessor)
    #         print(
    #             f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
    #         )

    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    # def test_run_gpt3(self):
    #     cache_path = snapshot_download(self.gpt3_base_model_id)
    #     model = GPT3ForTextGeneration(cache_path)
    #     preprocessor = TextGenerationPreprocessor(
    #         cache_path,
    #         model.tokenizer,
    #         first_sequence='sentence',
    #         second_sequence=None)
    #     pipeline1 = TextGenerationPipeline(model, preprocessor)
    #     pipeline2 = pipeline(
    #         Tasks.text_generation, model=model, preprocessor=preprocessor)
    #     print(
    #         f'pipeline1: {pipeline1(self.gpt3_input)}\npipeline2: {pipeline2(self.gpt3_input)}'
    #     )

    # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    # def test_run_with_default_model(self):
    #     pipeline_ins = pipeline(task=Tasks.text_generation)
    #     print(pipeline_ins(self.palm_input_zh))


if __name__ == '__main__':