diff --git a/modelscope/models/nlp/gpt_moe/checkpointing.py b/modelscope/models/nlp/gpt_moe/checkpointing.py
index d5980e8a..08e5aaef 100644
--- a/modelscope/models/nlp/gpt_moe/checkpointing.py
+++ b/modelscope/models/nlp/gpt_moe/checkpointing.py
@@ -38,7 +38,7 @@ def get_checkpoint_names(checkpoints_path,
                                f'mp_rank_{tensor_rank:02d}')
 
     if num_experts[0] > 0:
-        model_name = common_path + '_model_states.pt'
+        model_name = os.path.join(common_path, 'model_rng.pt')
         optim_name = os.path.join(
             checkpoints_path, path_load_tag,
             f'expp_rank_{expp_rank}_mp_rank_{tensor_rank:02d}_optim_states.pt')
diff --git a/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py b/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py
index e8b36aca..9c238d6f 100644
--- a/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py
+++ b/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py
@@ -1151,7 +1151,8 @@ class DistributedGPTMoE(TorchModel):
                 attention_mask=None,
                 position_ids=None,
                 labels=None,
-                prompt_length=None):
+                prompt_length=None,
+                is_pair=(False, )):
         outputs, *other_losses = self.dist_model(
             tokens,