Mirror of https://github.com/modelscope/modelscope.git, synced 2025-12-16 16:27:45 +01:00
Fix bug of amp and device_map (#397)
* fix amp
* remove useless code
* Fix bug
@@ -226,9 +226,7 @@ kwargs = dict(
     train_dataset=train_dataset,
     eval_dataset=validation_dataset,
     seed=args.seed,
-    cfg_modify_fn=cfg_modify_fn,
-    # No placement for model, leave the model to `device_map`
-    device='cpu')
+    cfg_modify_fn=cfg_modify_fn)

 trainer: EpochBasedTrainer = build_trainer(
     name=args.trainer, default_args=kwargs)
@@ -206,11 +206,8 @@ model_config['model']['prefix_projection'] = args.prefix_projection
 tokenizer = ChatGLMTokenizer.from_pretrained(model_dir, trust_remote_code=True)

 device_map_kwargs = {}
-device_kwargs = {}
 if args.use_lora != 0 and torch.cuda.device_count() > 1:
     device_map_kwargs['device_map'] = 'auto'
-    # No placement for model, leave the model to `device_map`
-    device_kwargs['device'] = 'cpu'
 model = Model.from_pretrained(
     model_dir, cfg_dict=model_config, **device_map_kwargs)

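For context, the loading pattern the scripts converge on after this change can be sketched as a small helper: `device_map='auto'` is requested only for multi-GPU LoRA runs, and no explicit device placement is passed anywhere else. The helper name below is illustrative, not part of the commit.

import torch

def build_device_map_kwargs(use_lora: int) -> dict:
    # Ask for automatic sharding across GPUs only when LoRA training
    # spans several devices; otherwise return no placement hint at all.
    if use_lora != 0 and torch.cuda.device_count() > 1:
        return {'device_map': 'auto'}
    return {}

# Usage, mirroring the script above (model_dir / model_config come from it):
# model = Model.from_pretrained(model_dir, cfg_dict=model_config,
#                               **build_device_map_kwargs(args.use_lora))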
@@ -396,7 +393,6 @@ trainer = Seq2SeqTrainer(
     seed=args.seed,
     data_collator=data_collator,
     remove_unused_data=True,
-    cfg_modify_fn=cfg_modify_fn,
-    **device_kwargs)
+    cfg_modify_fn=cfg_modify_fn)
 trainer.tokenizer = tokenizer
 trainer.train()

@@ -1698,7 +1698,6 @@
 "    eval_dataset=val_dataset,\n",
 "    remove_unused_data=True,\n",
 "    seed=42,\n",
-"    device='cpu', # No placement for model, leave the model to `device_map`\n",
 "    cfg_modify_fn=cfg_modify_fn,\n",
 ")\n",
 "\n",

@@ -1797,7 +1797,6 @@
 "    eval_dataset=val_dataset,\n",
 "    remove_unused_data=True,\n",
 "    seed=42,\n",
-"    device='cpu', # No placement for model, leave the model to `device_map`\n",
 "    cfg_modify_fn=cfg_modify_fn,\n",
 ")\n",
 "\n",

@@ -144,7 +144,8 @@ class Pipeline(ABC):
             if not isinstance(model, torch.nn.Module):
                 return
             model.eval()
             if self.device_map is None:
-                model.to(self.device)
+                from modelscope.utils.torch_utils import is_on_same_device
+                if is_on_same_device(model):
+                    model.to(self.device)

         if not self._model_prepare:

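The intent of the pipeline change, restated as a standalone sketch: a model that `device_map` has already spread over several devices must not be collapsed onto one device by a blanket `.to()` call. The wrapper name `move_if_single_device` is hypothetical; `is_on_same_device` is the helper this commit adds to `modelscope.utils.torch_utils`.

import torch
from modelscope.utils.torch_utils import is_on_same_device  # added by this commit

def move_if_single_device(model: torch.nn.Module, device: torch.device) -> None:
    # Relocate the model only when all of its parameters already sit on a
    # single device; a model sharded across devices is left where it is.
    if is_on_same_device(model):
        model.to(device)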
@@ -43,7 +43,8 @@ from modelscope.utils.logger import get_logger
 from modelscope.utils.registry import build_from_cfg
 from modelscope.utils.torch_utils import (compile_model, get_dist_info,
                                           get_local_rank, init_dist, is_dist,
-                                          is_master, set_random_seed)
+                                          is_master, is_on_same_device,
+                                          set_random_seed)
 from ..swift import Swift
 from .base import BaseTrainer
 from .builder import TRAINERS
@@ -257,7 +258,7 @@ class EpochBasedTrainer(BaseTrainer):
             # If not working in parallel scenario, put model to device as a default logic.
             device_name = self.device if self.device is not None else 'gpu'
             self.device = create_device(device_name)
-            if self.device.type == 'cuda':
+            if self.device.type == 'cuda' and is_on_same_device(self.model):
                 self.model.to(self.device)

         self.print_cfg()

@@ -354,3 +354,8 @@ def all_gather(data, group=None):
         data_list.append(pickle.loads(buffer))

     return data_list
+
+
+def is_on_same_device(model: torch.nn.Module) -> bool:
+    device_set = set(map(lambda p: p.device, model.parameters()))
+    return len(device_set) == 1
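A quick illustration of what the new helper reports. The toy modules below are made up for demonstration; the second one mixes a CPU parameter with a 'meta'-device parameter so the check fails without needing a GPU.

import torch

def is_on_same_device(model: torch.nn.Module) -> bool:
    # Same body as the helper added above, repeated to keep the demo self-contained.
    device_set = set(map(lambda p: p.device, model.parameters()))
    return len(device_set) == 1

uniform = torch.nn.Linear(4, 4)            # every parameter lives on the CPU
print(is_on_same_device(uniform))          # True -> safe to call model.to(device)

mixed = torch.nn.Sequential(
    torch.nn.Linear(4, 4),                 # CPU
    torch.nn.Linear(4, 4, device='meta'),  # placeholder device, no real storage
)
print(is_on_same_device(mixed))            # False -> leave placement to `device_map`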