Support lora for llama

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13080086

* support lora for llama

* update baichuan

* remove work_dir

* fix bugs: 1. change ConfigDict to a plain list when the hooks key is not in the config 2. ignore all bin files when preparing the output folder

* 1. support device_map 2. remove the cast to float when using LoRA

* add inference file

* add comment

* support device_map
Author: hemu.zp
Date: 2023-06-29 22:05:34 +08:00
Committed by: wenmeng.zwm
Parent: cc0e7527d7
Commit: f4c90f2adf
16 changed files with 461 additions and 51 deletions
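
Across the updated training scripts the LoRA setup follows the same three steps: build a LoRAConfig targeting the attention projection modules, cast the model to bfloat16, and let Swift inject the adapters. A minimal sketch of that pattern (module names taken from the llama diff below; the baichuan script targets 'pack' instead):

from modelscope.swift import Swift
from modelscope.swift.lora import LoRAConfig


def wrap_with_lora(model, rank=32, alpha=32, dropout=0.05):
    """Attach LoRA adapters to the llama attention projections."""
    lora_config = LoRAConfig(
        replace_modules=['q_proj', 'k_proj', 'v_proj'],  # 'pack' for baichuan
        rank=rank,
        lora_alpha=alpha,
        lora_dropout=dropout)
    # The LoRA path keeps the base weights in bf16 and skips the float()
    # cast used for full finetuning.
    model = model.bfloat16()
    Swift.prepare_model(model, lora_config)
    return model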

View File

@@ -0,0 +1,44 @@
{
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps": "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto"
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 10000000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
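
This is a ZeRO stage-2 configuration with optimizer and parameter offload to CPU; the "auto" placeholders are filled in from the trainer's arguments before deepspeed.initialize is called. The new zero_stage argument on DeepspeedHook (see the hook changes further down) can override the stage from this file. A minimal sketch of that override, assuming the file is saved as default_offload_opt_param.json as in the llama run script:

import json

# Load the ZeRO config referenced by --deepspeed and apply the optional
# --zero_stage override, mirroring what DeepspeedHook does internally.
with open('default_offload_opt_param.json') as f:
    ds_config = json.load(f)

zero_stage = 2  # e.g. the value passed as --zero_stage
if zero_stage is not None:
    assert zero_stage in (0, 1, 2, 3), 'zero_stage must be in (0, 1, 2, 3)!'
    ds_config['zero_optimization']['stage'] = zero_stage

print(ds_config['zero_optimization']['stage'])  # -> 2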

View File

@@ -0,0 +1,235 @@
import os
import sys
from dataclasses import dataclass, field
from transformers import AutoModelForCausalLM, AutoTokenizer
from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
build_dataset_from_file, snapshot_download)
from modelscope.metainfo import Trainers
from modelscope.preprocessors import TextGenerationTransformersPreprocessor
from modelscope.swift import Swift
from modelscope.swift.lora import LoRAConfig
from modelscope.trainers import build_trainer
DEFAULT_PAD_TOKEN = '[PAD]'
DEFAULT_EOS_TOKEN = '</s>'
DEFAULT_BOS_TOKEN = '<s>'
DEFAULT_UNK_TOKEN = '<unk>'
@dataclass(init=False)
class TextGenerationArguments(TrainingArgs):
trainer: str = field(
default=Trainers.default, metadata={
'help': 'The trainer used',
})
src_txt: str = field(
default=None,
metadata={
'help': 'The source text key of preprocessor',
'cfg_node': 'preprocessor.src_txt'
})
tgt_txt: str = field(
default=None,
metadata={
'help': 'The target text key of preprocessor',
'cfg_node': 'preprocessor.tgt_txt'
})
preprocessor: str = field(
default=None,
metadata={
'help': 'The preprocessor type',
'cfg_node': 'preprocessor.type'
})
lr_scheduler: str = field(
default=None,
metadata={
'help': 'The lr scheduler type',
'cfg_node': 'train.lr_scheduler.type'
})
world_size: int = field(
default=None,
metadata={
'help': 'The parallel world size',
'cfg_node': 'megatron.world_size'
})
tensor_model_parallel_size: int = field(
default=None,
metadata={
'help': 'The tensor model parallel size',
'cfg_node': 'megatron.tensor_model_parallel_size'
})
use_megatron: bool = field(
default=None, metadata={
'help': 'Whether to use MegatronHook',
})
bf16: bool = field(
default=False,
metadata={
'help': 'Whether to use bf16',
'cfg_node': 'train.bf16'
})
deepspeed: str = field(
default=None,
metadata={
'help': 'The location of DeepSpeed json config file.',
})
T_max: int = field(
default=None,
metadata={
'help': 'The T_max for CosineAnnealingLR',
'cfg_node': 'train.lr_scheduler.T_max'
})
use_lora: int = field(
default=0,
metadata={'help': 'Whether to use lora to train the model.'},
)
lora_rank: int = field(
default=32,
metadata={'help': 'The lora rank'},
)
lora_alpha: int = field(
default=32,
metadata={'help': 'The lora alpha'},
)
lora_dropout: float = field(
default=0.05,
metadata={'help': 'The lora dropout'},
)
device_map: str = field(
default=None,
metadata={
'help': 'A map that specifies where each submodule should go.'
})
def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer,
model):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
if num_new_tokens > 0:
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
dim=0, keepdim=True)
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
config, args = TextGenerationArguments().parse_cli().to_config()
print(config, args)
def cfg_modify_fn(cfg):
if args.use_model_config:
cfg.merge_from_dict(config)
else:
cfg = config
if 'hooks' not in cfg.train:
cfg.train['hooks'] = []
if args.use_megatron:
cfg.train.hooks.append({'type': 'MegatronHook'})
if args.deepspeed:
cfg.train.hooks.append({
'type': 'DeepspeedHook',
'config': args.deepspeed,
'save_zero_checkpoint': True,
'with_mpu': False,
})
return cfg
if args.dataset_json_file is None:
train_dataset = MsDataset.load(
args.train_dataset_name,
subset_name=args.train_subset_name,
split=args.train_split,
namespace=args.train_dataset_namespace)
validation_dataset = MsDataset.load(
args.val_dataset_name,
subset_name=args.val_subset_name,
split=args.val_split,
namespace=args.val_dataset_namespace)
else:
train_dataset, validation_dataset = build_dataset_from_file(
args.dataset_json_file)
model_dir = snapshot_download(args.model)
sys.path.append(model_dir)
model = AutoModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=True, device_map=args.device_map)
cfg_file = os.path.join(model_dir, 'configuration.json')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
special_tokens_dict = dict()
if tokenizer.pad_token is None or tokenizer.pad_token == '':
special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None or tokenizer.eos_token == '':
special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None or tokenizer.bos_token == '':
special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None or tokenizer.unk_token == '':
special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN
smart_tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model,
)
preprocessor = TextGenerationTransformersPreprocessor(
model_dir,
tokenizer=tokenizer,
src_txt=config.preprocessor.src_txt,
tgt_txt=config.preprocessor.tgt_txt)
if args.use_lora != 0:
lora_config = LoRAConfig(
replace_modules=['pack'],
rank=args.lora_rank,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout)
model = model.bfloat16()
Swift.prepare_model(model, lora_config)
kwargs = dict(
model=model,
cfg_file=cfg_file,
preprocessor=preprocessor,
train_dataset=train_dataset,
eval_dataset=validation_dataset,
seed=args.seed,
cfg_modify_fn=cfg_modify_fn,
# No placement for model, leave the model to `device_map`
device='cpu')
trainer: EpochBasedTrainer = build_trainer(
name=args.trainer, default_args=kwargs)
trainer.train()
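
For reference, a minimal inference sketch for the LoRA-tuned baichuan model, modeled on the chatglm lora_inference examples later in this commit; the pretrained_weights path is an assumption and replace_modules must match the training run above:

from modelscope import snapshot_download
from modelscope.swift import Swift
from modelscope.swift.lora import LoRAConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = snapshot_download('baichuan-inc/baichuan-7B')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir, trust_remote_code=True, device_map='auto')

lora_config = LoRAConfig(
    replace_modules=['pack'],  # must match the training run
    rank=32,
    lora_alpha=32,
    lora_dropout=0.05,
    pretrained_weights='./tmp/output/pytorch_model.bin')  # assumed output path
model = model.bfloat16()
Swift.prepare_model(model, lora_config)

inputs = tokenizer('登鹳雀楼->王之涣\n夜雨寄北->', return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))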

View File

@@ -0,0 +1,21 @@
export PYTHONPATH=$PYTHONPATH:./
torchrun examples/pytorch/baichuan/finetune_baichuan.py \
--trainer 'text-generation-trainer' \
--work_dir './tmp' \
--model 'baichuan-inc/baichuan-7B' \
--train_dataset_name 'chinese-poetry-collection' \
--val_dataset_name 'chinese-poetry-collection' \
--train_split 'train' \
--val_split 'test' \
--src_txt 'text1' \
--tgt_txt 'text2' \
--max_epochs 1 \
--per_device_train_batch_size 8 \
--lr 2e-5 \
--lr_scheduler 'CosineAnnealingLR' \
--eval_strategy 'no' \
--bf16 1 \
--use_lora 1 \
--eval_metrics 'text-gen-metric' \
--T_max 1 \
--device_map 'auto' \

View File

@@ -192,7 +192,8 @@ if config['model']['type'] == 'chatglm6b':
model_config['model']['prefix_projection'] = args.prefix_projection
tokenizer = ChatGLMTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = Model.from_pretrained(model_dir, cfg_dict=model_config)
model = Model.from_pretrained(
model_dir, cfg_dict=model_config, device_map='auto')
if args.ptuning_checkpoint is not None:
# Evaluation
@@ -213,7 +214,7 @@ if args.pre_seq_len is not None:
# P-tuning v2
model = model.half()
model.transformer.prefix_encoder.float()
else:
elif not args.use_lora:
# Finetune
model = model.float()
@@ -367,6 +368,8 @@ model.gradient_checkpointing_enable()
if config['model']['type'] == 'chatglm6b':
model.enable_input_require_grads()
# import torch
# model = torch.nn.DataParallel(model).cuda()
trainer = Seq2SeqTrainer(
model=model,
cfg_file='./configuration.json',
@@ -375,6 +378,8 @@ trainer = Seq2SeqTrainer(
seed=args.seed,
data_collator=data_collator,
remove_unused_data=True,
# No placement for model, leave the model to `device_map`
device='cpu',
cfg_modify_fn=cfg_modify_fn)
trainer.tokenizer = tokenizer
trainer.train()

View File

@@ -11,17 +11,17 @@ lora_config = LoRAConfig(
lora_dropout=0.05,
pretrained_weights='./lora_dureader_target/iter_600.pth')
model_dir = 'ZhipuAI/chatglm2-6b'
model_dir = 'ZhipuAI/ChatGLM-6B'
model_config = read_config(model_dir)
model_config['model'] = ConfigDict({
'type': Models.chatglm2_6b,
'type': Models.chatglm_6b,
})
model = Model.from_pretrained(model_dir, cfg_dict=model_config)
model = model.bfloat16()
Swift.prepare_model(model, lora_config)
pipe = pipeline('chat', model, pipeline_name='chatglm2_6b-text-generation')
pipe = pipeline('chat', model, pipeline_name='chatglm6b-text-generation')
print(
pipe({

View File

@@ -0,0 +1,31 @@
from modelscope import Model, pipeline, read_config
from modelscope.metainfo import Models
from modelscope.swift import Swift
from modelscope.swift.lora import LoRAConfig
from modelscope.utils.config import ConfigDict
lora_config = LoRAConfig(
replace_modules=['attention.query_key_value'],
rank=32,
lora_alpha=32,
lora_dropout=0.05,
pretrained_weights='./lora_dureader_target/iter_600.pth')
model_dir = 'ZhipuAI/chatglm2-6b'
model_config = read_config(model_dir)
model_config['model'] = ConfigDict({
'type': Models.chatglm2_6b,
})
model = Model.from_pretrained(model_dir, cfg_dict=model_config)
model = model.bfloat16()
Swift.prepare_model(model, lora_config)
pipe = pipeline('chat', model, pipeline_name='chatglm2_6b-text-generation')
print(
pipe({
'text':
'纵使进入21世纪后我国教育水平有了明显进步高考的难度却依旧不容小觑高考被中国学生和家长定义为改变命运、改写人生脑重要考试为了这场考试学生和家长都付出了很多。',
'history': []
}))

View File

@@ -4,14 +4,10 @@
import copy
import logging
import os
import shutil
import tempfile
import unittest
from dataclasses import dataclass, field
import json
import torch
import utils
from modelscope import TrainingArgs
from modelscope.hub.snapshot_download import snapshot_download
@@ -19,6 +15,8 @@ from modelscope.metainfo import Trainers
from modelscope.models.nlp.llama import LlamaForTextGeneration, LlamaTokenizer
from modelscope.msdatasets.dataset_cls.custom_datasets.torch_custom_dataset import \
TorchCustomDataset
from modelscope.swift import Swift
from modelscope.swift.lora import LoRAConfig
from modelscope.trainers import build_trainer
IGNORE_INDEX = -100
@@ -54,11 +52,35 @@ class TextGenerationArguments(TrainingArgs):
'help': 'The location of DeepSpeed json config file.',
})
work_dir: str = field(
default=None, metadata={
'help': 'The location of work dir',
use_lora: int = field(
default=0,
metadata={'help': 'Whether to use lora to train the model.'},
)
lora_rank: int = field(
default=32,
metadata={'help': 'The lora rank'},
)
lora_alpha: int = field(
default=32,
metadata={'help': 'The lora alpha'},
)
lora_dropout: float = field(
default=0.05,
metadata={'help': 'The lora dropout'},
)
device_map: str = field(
default=None,
metadata={
'help': 'A map that specifies where each submodule should go.'
})
zero_stage: int = field(
default=None, metadata={'help': 'The stage of zero_optimization'})
def _tokenize_fn(strings, tokenizer):
"""Tokenize a list of strings."""
@@ -211,12 +233,15 @@ if __name__ == '__main__':
cfg.train.dataloader = {'batch_size_per_gpu': 4, 'workers_per_gpu': 1}
if 'hooks' not in cfg.train:
cfg.train['hooks'] = []
if args.deepspeed is not None:
cfg.train.hooks.append({
'type': 'DeepspeedHook',
'config': args.deepspeed,
'save_zero_checkpoint': True,
'with_mpu': False,
})
if args.zero_stage is not None:
cfg.train.hooks[-1]['zero_stage'] = args.zero_stage
cfg.preprocessor.sequence_length = 512
return cfg
@@ -225,7 +250,17 @@ if __name__ == '__main__':
args.model) else snapshot_download(args.model)
data_path = args.src_txt if args.src_txt else os.path.join(
model_path, 'alpaca_data.json')
model = LlamaForTextGeneration.from_pretrained(model_path)
model = LlamaForTextGeneration.from_pretrained(
model_path, device_map=args.device_map)
if args.use_lora != 0:
lora_config = LoRAConfig(
replace_modules=['q_proj', 'k_proj', 'v_proj'],
rank=args.lora_rank,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout)
model = model.bfloat16()
Swift.prepare_model(model, lora_config)
tokenizer = LlamaTokenizer.from_pretrained(
model_path,
@@ -234,9 +269,13 @@ if __name__ == '__main__':
)
special_tokens_dict = dict()
if tokenizer.pad_token is None or tokenizer.pad_token == '':
special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None or tokenizer.eos_token == '':
special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None or tokenizer.bos_token == '':
special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None or tokenizer.unk_token == '':
special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN
smart_tokenizer_and_embedding_resize(
@@ -263,7 +302,7 @@ if __name__ == '__main__':
trainer.train()
# prepare for inference
if int(os.environ.get('LOCAL_RANK', 0)) == 0:
if args.deepspeed and int(os.environ.get('LOCAL_RANK', 0)) == 0:
tokenizer.save_pretrained(os.path.join(args.work_dir, 'output'))
os.system(f'rm {args.work_dir}/output/pytorch_model*')
os.system(

View File

@@ -0,0 +1,11 @@
DATA_PARALLEL_SIZE=1
export PYTHONPATH=$PYTHONPATH:./
torchrun --nproc_per_node $DATA_PARALLEL_SIZE examples/pytorch/llama/finetune_llama.py \
--work_dir './tmp' \
--model 'skyline2006/llama-7b' \
--deepspeed 'default_offload_opt_param.json' \
--eval_interval 100 \
--use_lora 1 \
--zero_stage 2 \

View File

@@ -1267,6 +1267,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=-100)
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))

View File

@@ -1003,6 +1003,7 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=-100)
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
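
Both one-line additions above fix the same problem: with device_map='auto' the layers can be spread over several GPUs, so the labels may live on a different device than the final logits and CrossEntropyLoss would raise a device-mismatch error. A self-contained illustration of the aligned loss computation:

import torch
from torch.nn import CrossEntropyLoss


def shifted_lm_loss(lm_logits, labels):
    # Standard next-token shift, then move the labels to wherever the logits
    # ended up (they can differ when the model is sharded via device_map).
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
    loss_fct = CrossEntropyLoss(ignore_index=-100)
    return loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))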

View File

@@ -139,8 +139,15 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
model_type = None
if model_dir is not None:
model_type = get_model_type(model_dir)
if tokenizer is not None:
self.nlp_tokenizer = NLPTokenizer(tokenize_kwargs=kwargs)
self.nlp_tokenizer._tokenizer = tokenizer
else:
self.nlp_tokenizer = NLPTokenizerForRoberta(
model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs)
model_dir,
model_type,
use_fast=use_fast,
tokenize_kwargs=kwargs)
def decode(self, tokens, **kwargs):
"""Decode the tokens to real text.

View File

@@ -187,6 +187,7 @@ class CheckpointHook(Hook):
strategy=self.upload_strategy,
done=True)
wait_for_done(self.PUSH_TO_HUB_QUEUE_NAME)
if self.push_to_hub:
self.logger.info('Uploading models done.')
def _push_to_hub(self, trainer, prefix, output_dir, delete_dir=False):

View File

@@ -26,7 +26,6 @@ class CheckpointProcessor:
trainer: The trainer instance.
output_dir: The target folder used in inference.
"""
model = trainer.unwrap_module(trainer.model)
config = trainer.cfg
# override pipeline by tasks name after finetune done,
@@ -38,8 +37,7 @@ class CheckpointProcessor:
# TODO a temp fix to avoid pipeline_name and task mismatch
config['pipeline'] = {'type': config['task']}
self.copy_files_and_dump_config(trainer, output_dir, config,
self._bin_file(model))
self.copy_files_and_dump_config(trainer, output_dir, config, '*.bin')
@staticmethod
def copy_files_and_dump_config(trainer, output_dir, config, bin_file):

View File

@@ -144,7 +144,10 @@ class DeepspeedProcessor(CheckpointProcessor, LrSchedulerProcessor,
except (ImportError, AssertionError):
return ''
def get_bin_file(self):
def get_bin_filename(self, with_mpu=True):
if not with_mpu:
return 'pytorch_model.bin'
else:
mp_rank = mpu.get_tensor_model_parallel_rank()
rank = '{:02d}'.format(mp_rank)
return f'mp_rank_{rank}_model_states.pt'
@@ -163,13 +166,21 @@ class DeepspeedProcessor(CheckpointProcessor, LrSchedulerProcessor,
save_dir = os.path.dirname(checkpoint_path_prefix)
prefix = os.path.basename(checkpoint_path_prefix)
trainer.model.save_checkpoint(save_dir, prefix)
if not self.stage3_gather_16bit_weights_on_model_save:
return
bin_file = self.get_bin_file()
with_mpu = not mpu.is_unitialized()
bin_file = self.get_bin_filename(with_mpu)
src_file = os.path.join(checkpoint_path_prefix, bin_file)
if self.zero_stage == 3 or with_mpu:
trainer.model.save_checkpoint(save_dir, prefix)
else:
save_checkpoint(
model, src_file, None, None, meta=None, with_meta=False)
if self.zero_stage == 3:
return
if with_mpu:
dest_file = os.path.join(output_dir, self._BIN_FILE_DIR, bin_file)
else:
dest_file = os.path.join(output_dir, bin_file)
if os.path.isfile(dest_file):
os.unlink(dest_file)
@@ -214,7 +225,7 @@ class DeepspeedProcessor(CheckpointProcessor, LrSchedulerProcessor,
else:
# in eval or prediction
save_dir = checkpoint_path_prefix
bin_file = self.get_bin_file()
bin_file = self.get_bin_filename()
model_file = os.path.join(save_dir, bin_file)
checkpoint = torch.load(
model_file, map_location=lambda storage, loc: storage)
@@ -273,11 +284,16 @@ class DeepspeedHook(Hook):
config=None,
deepspeed_activation_checkpointing=True,
save_zero_checkpoint=False,
with_mpu=True):
with_mpu=True,
zero_stage=None):
self.save_zero_checkpoint = save_zero_checkpoint
self.deepspeed_activation_checkpointing = deepspeed_activation_checkpointing
self.with_mpu = with_mpu
self.deepspeed_config = config
if zero_stage is not None:
assert zero_stage in (0, 1, 2,
3), 'zero_stage must be in (0, 1, 2, 3)!'
self.zero_stage = zero_stage
def register_processor(self, trainer):
processor = DeepspeedProcessor()
@@ -376,9 +392,9 @@ class DeepspeedHook(Hook):
optimizer, lr_scheduler = deepspeed_optim_sched(
trainer, ds_config, max_steps)
config = ds_config.config
self.processor.stage3_gather_16bit_weights_on_model_save = config[
'zero_optimization'].get(
'stage3_gather_16bit_weights_on_model_save', True)
if self.zero_stage is not None:
config['zero_optimization']['stage'] = self.zero_stage
self.processor.zero_stage = config['zero_optimization'].get('stage', 0)
trainer.model, trainer.optimizer, _, trainer.lr_scheduler = deepspeed.initialize(
model=trainer.model,
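
The rename from get_bin_file to get_bin_filename also changes behavior: without model parallelism the processor now uses a plain Hugging Face-style weight file name instead of an mp_rank shard. A self-contained illustration of that logic (the rank normally comes from mpu.get_tensor_model_parallel_rank()):

def get_bin_filename(with_mpu: bool = True, mp_rank: int = 0) -> str:
    # Without megatron-style model parallelism, save/load a single HF-style file.
    if not with_mpu:
        return 'pytorch_model.bin'
    # Otherwise keep the per-tensor-parallel-rank shard name.
    return f'mp_rank_{mp_rank:02d}_model_states.pt'


assert get_bin_filename(with_mpu=False) == 'pytorch_model.bin'
assert get_bin_filename(with_mpu=True, mp_rank=0) == 'mp_rank_00_model_states.pt'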

View File

@@ -35,7 +35,7 @@ class MpuProcessor(CheckpointProcessor):
except (ImportError, AssertionError):
return ''
def get_bin_file(self):
def get_bin_filename(self):
mp_rank = mpu.get_tensor_model_parallel_rank()
rank = '{:02d}'.format(mp_rank)
return f'mp_rank_{rank}_model_states.pt'
@@ -72,7 +72,7 @@ class MpuProcessor(CheckpointProcessor):
save_dir = os.path.dirname(checkpoint_path_prefix)
prefix = os.path.basename(checkpoint_path_prefix)
bin_file = self.get_bin_file()
bin_file = self.get_bin_filename()
prefix_bin_file = os.path.join(save_dir, prefix + '_' + bin_file)
save_checkpoint(model, prefix_bin_file, with_meta=False)
@@ -98,7 +98,7 @@ class MpuProcessor(CheckpointProcessor):
save_dir = os.path.dirname(checkpoint_path_prefix)
prefix = os.path.basename(checkpoint_path_prefix)
bin_file = self.get_bin_file()
bin_file = self.get_bin_filename()
absolute_file = os.path.join(save_dir, prefix + '_' + bin_file)
if os.path.isfile(absolute_file):
os.remove(absolute_file)
@@ -108,7 +108,7 @@ class MpuProcessor(CheckpointProcessor):
model = trainer.unwrap_module(trainer.model)
if os.path.isdir(checkpoint_path_prefix):
save_dir = checkpoint_path_prefix
bin_file = self.get_bin_file()
bin_file = self.get_bin_filename()
model_file = os.path.join(save_dir, bin_file)
load_checkpoint(model_file, model, None, None)
else:
@@ -119,7 +119,7 @@ class MpuProcessor(CheckpointProcessor):
save_dir = os.path.dirname(checkpoint_path_prefix)
prefix = os.path.basename(checkpoint_path_prefix)
bin_file = self.get_bin_file()
bin_file = self.get_bin_filename()
model_file = os.path.join(save_dir, prefix + '_' + bin_file)
load_checkpoint(model_file, model, None, None)

View File

@@ -232,7 +232,7 @@ class EpochBasedTrainer(BaseTrainer):
# A logic to fit the current code
# Put a DDPHook in if launcher is provided.
if 'hooks' not in self.cfg.train:
self.cfg.train['hooks'] = ConfigDict([])
self.cfg.train['hooks'] = []
self.cfg.train['hooks'].append({
'type': 'DDPHook',
'launcher': self.launcher