diff --git a/modelscope/models/nlp/gpt_moe/checkpointing.py b/modelscope/models/nlp/gpt_moe/checkpointing.py
index d5980e8a..08e5aaef 100644
--- a/modelscope/models/nlp/gpt_moe/checkpointing.py
+++ b/modelscope/models/nlp/gpt_moe/checkpointing.py
@@ -38,7 +38,7 @@ def get_checkpoint_names(checkpoints_path,
                                f'mp_rank_{tensor_rank:02d}')
 
     if num_experts[0] > 0:
-        model_name = common_path + '_model_states.pt'
+        model_name = os.path.join(common_path, 'model_rng.pt')
         optim_name = os.path.join(
             checkpoints_path, path_load_tag,
             f'expp_rank_{expp_rank}_mp_rank_{tensor_rank:02d}_optim_states.pt')
diff --git a/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py b/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py
index e8b36aca..9c238d6f 100644
--- a/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py
+++ b/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py
@@ -1151,7 +1151,8 @@ class DistributedGPTMoE(TorchModel):
                 attention_mask=None,
                 position_ids=None,
                 labels=None,
-                prompt_length=None):
+                prompt_length=None,
+                is_pair=(False, )):
         outputs, *other_losses = self.dist_model(
             tokens,