diff --git a/examples/pytorch/llm/llm_infer.py b/examples/pytorch/llm/llm_infer.py index 614e3d36..d496186f 100644 --- a/examples/pytorch/llm/llm_infer.py +++ b/examples/pytorch/llm/llm_infer.py @@ -17,7 +17,7 @@ class InferArguments: default='lora', metadata={'choices': ['lora', 'full']}) ckpt_path: str = '/path/to/your/iter_xxx.pth' eval_human: bool = False # False: eval test_dataset - ignore_args_error: bool = True # False: notebook compatibility + ignore_args_error: bool = False # True: notebook compatibility dataset: str = field( default='alpaca-en,alpaca-zh', @@ -96,7 +96,7 @@ def llm_infer(args: InferArguments) -> None: inference(input_ids, model, tokenizer, streamer, generation_config) print('-' * 80) else: - dataset = get_dataset(args.dataset) + dataset = get_dataset(args.dataset.split(',')) _, test_dataset = process_dataset(dataset, args.dataset_test_size, args.dataset_sample, args.dataset_seed) diff --git a/examples/pytorch/llm/llm_sft.py b/examples/pytorch/llm/llm_sft.py index a7dabf77..3fad52bb 100644 --- a/examples/pytorch/llm/llm_sft.py +++ b/examples/pytorch/llm/llm_sft.py @@ -30,7 +30,7 @@ class SftArguments: # baichuan-7b: 'lora': 16G; 'full': 80G sft_type: str = field( default='lora', metadata={'choices': ['lora', 'full']}) - ignore_args_error: bool = True # False: notebook compatibility + ignore_args_error: bool = False # True: notebook compatibility dataset: str = field( default='alpaca-en,alpaca-zh', @@ -121,7 +121,7 @@ def llm_sft(args: SftArguments) -> None: logger.info(f'device: {_p.device}, dtype: {_p.dtype}') # ### Loading Dataset - dataset = get_dataset(args.dataset) + dataset = get_dataset(args.dataset.split(',')) train_dataset, val_dataset = process_dataset(dataset, args.dataset_test_size, args.dataset_sample, diff --git a/examples/pytorch/llm/utils/__init__.py b/examples/pytorch/llm/utils/__init__.py index e4772c03..c5051a97 100644 --- a/examples/pytorch/llm/utils/__init__.py +++ b/examples/pytorch/llm/utils/__init__.py @@ -1,5 +1,3 
@@ -from _parser import * - from .dataset import * from .models import * from .utils import * diff --git a/examples/pytorch/llm/utils/dataset.py b/examples/pytorch/llm/utils/dataset.py index 3035ba78..619e3fbc 100644 --- a/examples/pytorch/llm/utils/dataset.py +++ b/examples/pytorch/llm/utils/dataset.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import List, Optional, Tuple import numpy as np from datasets import Dataset as HfDataset @@ -62,8 +62,7 @@ DATASET_MAPPER = { } -def get_dataset(dataset_names: str) -> HfDataset: - dataset_name_list = dataset_names.split(',') +def get_dataset(dataset_name_list: List[str]) -> HfDataset: dataset_list = [] for dataset_name in dataset_name_list: get_function = DATASET_MAPPER[dataset_name] diff --git a/examples/pytorch/llm/utils/models.py b/examples/pytorch/llm/utils/models.py index c95df561..9613581c 100644 --- a/examples/pytorch/llm/utils/models.py +++ b/examples/pytorch/llm/utils/models.py @@ -1,4 +1,6 @@ -from typing import NamedTuple +import os +import sys +from typing import Any, Dict, NamedTuple, Optional import torch from torch import dtype as Dtype @@ -10,22 +12,18 @@ from modelscope.models.nlp.chatglm2 import ChatGLM2Config, ChatGLM2Tokenizer logger = get_logger() -def _add_special_token(tokenizer): - if tokenizer.eos_token_id is None: - tokenizer.eos_token_id = 2 - if tokenizer.bos_token_id is None: - tokenizer.bos_token_id = 1 - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = 0 - logger.info(f'bos_token_id: {tokenizer.bos_token_id}, ' - f'eos_token_id: {tokenizer.eos_token_id}, ' - f'pad_token_id: {tokenizer.pad_token_id}') +def _add_special_token(tokenizer, special_token_mapper: Dict[str, +Any]) -> None: + for k, v in special_token_mapper.items(): + setattr(tokenizer, k, v) + assert tokenizer.eos_token is not None + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token def get_model_tokenizer_default(model_dir: str, - load_model: bool = True, - add_special_token: 
bool = True, - torch_dtype: Dtype = torch.float16): + torch_dtype: Dtype, + load_model: bool = True): """load from an independent repository""" model_config = AutoConfig.from_pretrained( model_dir, trust_remote_code=True) @@ -41,16 +39,12 @@ def get_model_tokenizer_default(model_dir: str, device_map='auto', torch_dtype=torch_dtype, trust_remote_code=True) - - if add_special_token: - _add_special_token(tokenizer) return model, tokenizer def get_model_tokenizer_chatglm2(model_dir: str, - load_model: bool = True, - add_special_token: bool = True, - torch_dtype: Dtype = torch.float16): + torch_dtype: Dtype, + load_model: bool = True): """load from ms library""" config = read_config(model_dir) logger.info(config) @@ -66,8 +60,6 @@ def get_model_tokenizer_chatglm2(model_dir: str, config=model_config, device_map='auto', torch_dtype=torch_dtype) - if add_special_token: - _add_special_token(tokenizer) return model, tokenizer @@ -79,18 +71,21 @@ class LoRATM(NamedTuple): # Reference: 'https://modelscope.cn/models/{model_id}/summary' +# keys: 'model_id', 'revision', 'torch_dtype', 'get_function', +# 'ignore_file_pattern', 'special_token_mapper', 'lora_TM' MODEL_MAPPER = { 'baichuan-7b': { - 'model_id': 'baichuan-inc/baichuan-7B', + 'model_id': 'baichuan-inc/baichuan-7B', # model id or model dir 'revision': 'v1.0.7', 'lora_TM': LoRATM.baichuan }, 'baichuan-13b': { 'model_id': 'baichuan-inc/Baichuan-13B-Base', 'revision': 'v1.0.3', + 'torch_dtype': torch.bfloat16, 'lora_TM': LoRATM.baichuan }, - 'chatglm2': { + 'chatglm2-6b': { 'model_id': 'ZhipuAI/chatglm2-6b', 'revision': 'v1.0.6', 'get_function': get_model_tokenizer_chatglm2, @@ -116,18 +111,25 @@ MODEL_MAPPER = { def get_model_tokenizer(model_type: str, - load_model: bool = True, - add_special_token: bool = True, - torch_dtype: Dtype = torch.float16): + torch_dtype: Optional[Dtype] = None, + load_model: bool = True): data = MODEL_MAPPER.get(model_type) if data is None: raise ValueError(f'model_type: {model_type}') + 
model_id = data['model_id'] - revision = data.get('revision', 'master') get_function = data.get('get_function', get_model_tokenizer_default) ignore_file_pattern = data.get('ignore_file_pattern', []) - model_dir = snapshot_download( - model_id, revision, ignore_file_pattern=ignore_file_pattern) - model, tokenizer = get_function(model_dir, load_model, add_special_token, - torch_dtype) + special_token_mapper = data.get('special_token_mapper', {}) + if torch_dtype is None: + torch_dtype = data.get('torch_dtype', torch.float16) + + model_dir = model_id + if not os.path.exists(model_id): + revision = data.get('revision', 'master') + model_dir = snapshot_download( + model_id, revision, ignore_file_pattern=ignore_file_pattern) + + model, tokenizer = get_function(model_dir, torch_dtype, load_model) + _add_special_token(tokenizer, special_token_mapper) return model, tokenizer, model_dir diff --git a/examples/pytorch/llm/utils/utils.py b/examples/pytorch/llm/utils/utils.py index 5b8ee163..3542c82a 100644 --- a/examples/pytorch/llm/utils/utils.py +++ b/examples/pytorch/llm/utils/utils.py @@ -7,7 +7,7 @@ import sys from dataclasses import dataclass, field from functools import partial from types import MethodType -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Counter, Dict, List, Optional, Tuple, Union import matplotlib.pyplot as plt import numpy as np @@ -152,10 +152,9 @@ def print_example(example: Dict[str, Any], tokenizer) -> None: print(f'[INPUT_IDS] {input_ids}') print(f'[INPUT] {tokenizer.decode(input_ids)}') print() + n_mask = Counter(labels)[-100] print(f'[LABLES_IDS] {labels}') - print( - f'[LABLES] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}' - ) + print(f'[LABLES] <-100 * {n_mask}>{tokenizer.decode(labels[n_mask:])}') def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: @@ -198,10 +197,10 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: 
logger.info(''.join(s)) -def show_freeze_layers(model: Module, max_lines: int = 20) -> None: +def show_freeze_layers(model: Module, max_lines: Optional[int] = 20) -> None: named_p = list(model.named_parameters()) for i, (n, p) in enumerate(named_p): - if i >= max_lines: + if max_lines is not None and i >= max_lines: logger.info('...') break logger.info(f'{n}: requires_grad={p.requires_grad}') diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index 147b80e9..0d27782b 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -611,10 +611,6 @@ def save_pretrained(model, raise Exception( 'At least pass in one checkpoint name for saving method') - # Clean the folder from a previous save - if os.path.exists(target_folder): - rmtree(target_folder) - # Single ckpt path, sharded ckpt logic will be added later output_ckpt_path = os.path.join(target_folder, save_checkpoint_name) @@ -629,7 +625,8 @@ def save_pretrained(model, copytree( model.model_dir, target_folder, - ignore=ignore_patterns(*ignore_file_set)) + ignore=ignore_patterns(*ignore_file_set), + dirs_exist_ok=True) # Save the ckpt to the save directory try: diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py index 1a673458..fb2d1265 100644 --- a/modelscope/utils/torch_utils.py +++ b/modelscope/utils/torch_utils.py @@ -357,6 +357,5 @@ def all_gather(data, group=None): def is_on_same_device(model: torch.nn.Module) -> bool: - device_set = set(map(lambda p: p.device.type, - model.parameters())) - {'cpu'} + device_set = set(str(p.device) for p in model.parameters()) - {'cpu'} return len(device_set) <= 1