Mirror of https://github.com/modelscope/modelscope.git, synced 2025-12-16 16:27:45 +01:00
Fix bug of amp and device_map (#397)
* fix amp
* remove useless code
* Fix bug
@@ -226,9 +226,7 @@ kwargs = dict(
     train_dataset=train_dataset,
     eval_dataset=validation_dataset,
     seed=args.seed,
-    cfg_modify_fn=cfg_modify_fn,
-    # No placement for model, leave the model to `device_map`
-    device='cpu')
+    cfg_modify_fn=cfg_modify_fn)

 trainer: EpochBasedTrainer = build_trainer(
     name=args.trainer, default_args=kwargs)
@@ -206,11 +206,8 @@ model_config['model']['prefix_projection'] = args.prefix_projection
 tokenizer = ChatGLMTokenizer.from_pretrained(model_dir, trust_remote_code=True)

 device_map_kwargs = {}
-device_kwargs = {}
 if args.use_lora != 0 and torch.cuda.device_count() > 1:
     device_map_kwargs['device_map'] = 'auto'
-    # No placement for model, leave the model to `device_map`
-    device_kwargs['device'] = 'cpu'
 model = Model.from_pretrained(
     model_dir, cfg_dict=model_config, **device_map_kwargs)

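For context, the loading pattern the scripts converge on after this change can be sketched as a small helper: `device_map='auto'` is requested only for multi-GPU LoRA runs, and no explicit device placement is passed anywhere else. The helper name below is illustrative, not part of the commit.

import torch

def build_device_map_kwargs(use_lora: int) -> dict:
    # Ask for automatic sharding across GPUs only when LoRA training
    # spans several devices; otherwise return no placement hint at all.
    if use_lora != 0 and torch.cuda.device_count() > 1:
        return {'device_map': 'auto'}
    return {}

# Usage, mirroring the script above (model_dir / model_config come from it):
# model = Model.from_pretrained(model_dir, cfg_dict=model_config,
#                               **build_device_map_kwargs(args.use_lora))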
@@ -396,7 +393,6 @@ trainer = Seq2SeqTrainer(
     seed=args.seed,
     data_collator=data_collator,
     remove_unused_data=True,
-    cfg_modify_fn=cfg_modify_fn,
-    **device_kwargs)
+    cfg_modify_fn=cfg_modify_fn)
 trainer.tokenizer = tokenizer
 trainer.train()

@@ -1698,7 +1698,6 @@
 "    eval_dataset=val_dataset,\n",
 "    remove_unused_data=True,\n",
 "    seed=42,\n",
-"    device='cpu', # No placement for model, leave the model to `device_map`\n",
 "    cfg_modify_fn=cfg_modify_fn,\n",
 ")\n",
 "\n",

@@ -1797,7 +1797,6 @@
 "    eval_dataset=val_dataset,\n",
 "    remove_unused_data=True,\n",
 "    seed=42,\n",
-"    device='cpu', # No placement for model, leave the model to `device_map`\n",
 "    cfg_modify_fn=cfg_modify_fn,\n",
 ")\n",
 "\n",

@@ -144,7 +144,8 @@ class Pipeline(ABC):
             if not isinstance(model, torch.nn.Module):
                 return
             model.eval()
             if self.device_map is None:
-                model.to(self.device)
+                from modelscope.utils.torch_utils import is_on_same_device
+                if is_on_same_device(model):
+                    model.to(self.device)

         if not self._model_prepare:

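The intent of the pipeline change, restated as a standalone sketch: a model that `device_map` has already spread over several devices must not be collapsed onto one device by a blanket `.to()` call. The wrapper name `move_if_single_device` is hypothetical; `is_on_same_device` is the helper this commit adds to `modelscope.utils.torch_utils`.

import torch
from modelscope.utils.torch_utils import is_on_same_device  # added by this commit

def move_if_single_device(model: torch.nn.Module, device: torch.device) -> None:
    # Relocate the model only when all of its parameters already sit on a
    # single device; a model sharded across devices is left where it is.
    if is_on_same_device(model):
        model.to(device)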
@@ -43,7 +43,8 @@ from modelscope.utils.logger import get_logger
 from modelscope.utils.registry import build_from_cfg
 from modelscope.utils.torch_utils import (compile_model, get_dist_info,
                                           get_local_rank, init_dist, is_dist,
-                                          is_master, set_random_seed)
+                                          is_master, is_on_same_device,
+                                          set_random_seed)
 from ..swift import Swift
 from .base import BaseTrainer
 from .builder import TRAINERS
@@ -257,7 +258,7 @@ class EpochBasedTrainer(BaseTrainer):
             # If not working in parallel scenario, put model to device as a default logic.
             device_name = self.device if self.device is not None else 'gpu'
             self.device = create_device(device_name)
-            if self.device.type == 'cuda':
+            if self.device.type == 'cuda' and is_on_same_device(self.model):
                 self.model.to(self.device)

         self.print_cfg()

@@ -354,3 +354,8 @@ def all_gather(data, group=None):
         data_list.append(pickle.loads(buffer))

     return data_list
+
+
+def is_on_same_device(model: torch.nn.Module) -> bool:
+    device_set = set(map(lambda p: p.device, model.parameters()))
+    return len(device_set) == 1
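A quick illustration of what the new helper reports. The toy modules below are made up for demonstration; the second one mixes a CPU parameter with a 'meta'-device parameter so the check fails without needing a GPU.

import torch

def is_on_same_device(model: torch.nn.Module) -> bool:
    # Same body as the helper added above, repeated to keep the demo self-contained.
    device_set = set(map(lambda p: p.device, model.parameters()))
    return len(device_set) == 1

uniform = torch.nn.Linear(4, 4)            # every parameter lives on the CPU
print(is_on_same_device(uniform))          # True -> safe to call model.to(device)

mixed = torch.nn.Sequential(
    torch.nn.Linear(4, 4),                 # CPU
    torch.nn.Linear(4, 4, device='meta'),  # placeholder device, no real storage
)
print(is_on_same_device(mixed))            # False -> leave placement to `device_map`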