From ba15de012fcc97461d0655bd817cb99d9dd6a2f8 Mon Sep 17 00:00:00 2001
From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
Date: Mon, 6 Jan 2025 17:29:44 +0800
Subject: [PATCH 1/9] fix
 https://www.modelscope.cn/models/iic/nlp_structbert_address-parsing_chinese_base/feedback/issueDetail/20431
 (#1170)

---
 .../preprocessors/nlp/token_classification_preprocessor.py      | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py
index b3ff9935..902dafca 100644
--- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py
+++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py
@@ -416,6 +416,8 @@ class TokenClassificationTransformersPreprocessor(
         offset_mapping = []
         tokens = self.nlp_tokenizer.tokenizer.tokenize(text)
         offset = 0
+        if getattr(self.nlp_tokenizer.tokenizer, 'do_lower_case', False):
+            text = text.lower()
         for token in tokens:
             is_start = (token[:2] != '##')
             if is_start:

From 768f953b9db39f800581d2b4971ccd1b93b4997d Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingdachen@apache.org>
Date: Fri, 10 Jan 2025 13:48:44 +0800
Subject: [PATCH 2/9] fix path concatenation to be Windows compatible (#1176)

* fix path concatenation to be Windows compatible

* support dataset too

---------

Co-authored-by: Yingda Chen <yingda.chen@alibaba-inc.com>
---
 modelscope/hub/snapshot_download.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py
index 4510280b..2f7f4790 100644
--- a/modelscope/hub/snapshot_download.py
+++ b/modelscope/hub/snapshot_download.py
@@ -233,7 +233,7 @@ def _snapshot_download(
         if repo_type == REPO_TYPE_MODEL:
             directory = os.path.abspath(
                 local_dir) if local_dir is not None else os.path.join(
-                    system_cache, repo_id)
+                    system_cache, *repo_id.split('/'))
             print(f'Downloading Model to directory: {directory}')
             revision_detail = _api.get_valid_revision_detail(
                 repo_id, revision=revision, cookies=cookies)
@@ -294,7 +294,7 @@ def _snapshot_download(
         elif repo_type == REPO_TYPE_DATASET:
             directory = os.path.abspath(
                 local_dir) if local_dir else os.path.join(
-                    system_cache, 'datasets', repo_id)
+                    system_cache, 'datasets', *repo_id.split('/'))
             print(f'Downloading Dataset to directory: {directory}')
 
             group_or_owner, name = model_id_to_group_owner_name(repo_id)

From cc6fbbab5f409e174410b337f9743bd2cf3262ee Mon Sep 17 00:00:00 2001
From: suluyana <110878454+suluyana@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:38:05 +0800
Subject: [PATCH 3/9] logger.warning when using remote code (#1171)

* logger warning when using remote code

Co-authored-by: suluyan <suluyan.sly@alibaba-inc.com>
---
 modelscope/models/nlp/polylm/text_generation.py  |  6 ++++++
 modelscope/msdatasets/data_loader/data_loader.py |  5 +++++
 .../data_loader/data_loader_manager.py           |  9 +++++++++
 modelscope/msdatasets/ms_dataset.py              |  5 +++++
 modelscope/msdatasets/utils/hf_datasets_util.py  | 16 ++++++++++++++++
 modelscope/pipelines/accelerate/vllm.py          |  6 ++++++
 .../pipelines/multi_modal/ovis_vl_pipeline.py    |  7 ++++++-
 modelscope/pipelines/nlp/llm_pipeline.py         | 12 ++++++++++++
 .../pipelines/nlp/text_generation_pipeline.py    | 12 ++++++++++++
 modelscope/preprocessors/templates/loader.py     |  3 +++
 modelscope/utils/automodel_utils.py              |  6 ++++++
 modelscope/utils/plugins.py                      |  3 +++
 12 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/modelscope/models/nlp/polylm/text_generation.py b/modelscope/models/nlp/polylm/text_generation.py
index 1881cf2b..bd6fbd69 100644
--- a/modelscope/models/nlp/polylm/text_generation.py
+++ b/modelscope/models/nlp/polylm/text_generation.py
@@ -10,8 +10,11 @@ from modelscope.models.base import Tensor, TorchModel
 from modelscope.models.builder import MODELS
 from modelscope.utils.constant import Tasks
 from modelscope.utils.hub import read_config
+from modelscope.utils.logger import get_logger
 from modelscope.utils.streaming_output import StreamingOutputMixin
 
+logger = get_logger()
+
 __all__ = ['PolyLMForTextGeneration']
 
 
@@ -27,6 +30,9 @@ class PolyLMForTextGeneration(TorchModel, StreamingOutputMixin):
         super().__init__(model_dir, *args, **kwargs)
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_dir, legacy=False, use_fast=False)
+        logger.warning(
+            f'Use trust_remote_code=True. Will invoke codes from {model_dir}. Please make sure '
+            'that you can trust the external codes.')
         self.model = AutoModelForCausalLM.from_pretrained(
             model_dir, device_map='auto', trust_remote_code=True)
         self.model.eval()
diff --git a/modelscope/msdatasets/data_loader/data_loader.py b/modelscope/msdatasets/data_loader/data_loader.py
index 92074449..71fcee45 100644
--- a/modelscope/msdatasets/data_loader/data_loader.py
+++ b/modelscope/msdatasets/data_loader/data_loader.py
@@ -133,6 +133,11 @@ class OssDownloader(BaseDownloader):
             raise f'meta-file: {dataset_name}.py not found on the modelscope hub.'
 
         if dataset_py_script and dataset_formation == DatasetFormations.hf_compatible:
+            if trust_remote_code:
+                logger.warning(
+                    f'Use trust_remote_code=True. Will invoke codes from {dataset_name}. Please make '
+                    'sure that you can trust the external codes.')
+
             self.dataset = hf_load_dataset(
                 dataset_py_script,
                 name=subset_name,
diff --git a/modelscope/msdatasets/data_loader/data_loader_manager.py b/modelscope/msdatasets/data_loader/data_loader_manager.py
index a9e58b7c..b64a8926 100644
--- a/modelscope/msdatasets/data_loader/data_loader_manager.py
+++ b/modelscope/msdatasets/data_loader/data_loader_manager.py
@@ -71,6 +71,11 @@ class LocalDataLoaderManager(DataLoaderManager):
         # Select local data loader
         # TODO: more loaders to be supported.
         if data_loader_type == LocalDataLoaderType.HF_DATA_LOADER:
+            if trust_remote_code:
+                logger.warning(
+                    f'Use trust_remote_code=True. Will invoke codes from {dataset_name}. Please make '
+                    'sure that you can trust the external codes.')
+
             # Build huggingface data loader and return dataset.
             return hf_data_loader(
                 dataset_name,
@@ -110,6 +115,10 @@ class RemoteDataLoaderManager(DataLoaderManager):
 
         # To use the huggingface data loader
         if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER:
+            if trust_remote_code:
+                logger.warning(
+                    f'Use trust_remote_code=True. Will invoke codes from {dataset_name}. Please make '
+                    'sure that you can trust the external codes.')
             dataset_ret = hf_data_loader(
                 dataset_name,
                 name=subset_name,
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 899142ad..dbe15171 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -237,6 +237,11 @@ class MsDataset:
             if not namespace or not dataset_name:
                 raise 'The dataset_name should be in the form of `namespace/dataset_name` or `dataset_name`.'
 
+        if trust_remote_code:
+            logger.warning(
+                f'Use trust_remote_code=True. Will invoke codes from {dataset_name}. Please make sure that '
+                'you can trust the external codes.')
+
         # Init context config
         dataset_context_config = DatasetContextConfig(
             dataset_name=dataset_name,
diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py
index 4d6de81c..fea304f6 100644
--- a/modelscope/msdatasets/utils/hf_datasets_util.py
+++ b/modelscope/msdatasets/utils/hf_datasets_util.py
@@ -835,6 +835,8 @@ def get_module_with_script(self) -> DatasetModule:
     if not os.path.exists(importable_file_path):
         trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
         if trust_remote_code:
+            logger.warning(f'Use trust_remote_code=True. Will invoke codes from {self.name}. Please make sure '
+                           'that you can trust the external codes.')
             _create_importable_file(
                 local_path=local_script_path,
                 local_imports=local_imports,
@@ -934,6 +936,11 @@ class DatasetsWrapperHF:
             verification_mode or VerificationMode.BASIC_CHECKS
         ) if not save_infos else VerificationMode.ALL_CHECKS)
 
+        if trust_remote_code:
+            logger.warning(f'Use trust_remote_code=True. Will invoke codes from {path}. Please make sure '
+                           'that you can trust the external codes.'
+                           )
+
         # Create a dataset builder
         builder_instance = DatasetsWrapperHF.load_dataset_builder(
             path=path,
@@ -1061,6 +1068,11 @@ class DatasetsWrapperHF:
             ) if download_config else DownloadConfig()
             download_config.storage_options.update(storage_options)
 
+        if trust_remote_code:
+            logger.warning(f'Use trust_remote_code=True. Will invoke codes from {path}. Please make sure '
+                           'that you can trust the external codes.'
+                           )
+
         dataset_module = DatasetsWrapperHF.dataset_module_factory(
             path,
             revision=revision,
@@ -1171,6 +1183,10 @@ class DatasetsWrapperHF:
         #   -> the module from the python file in the dataset repository
         # - if path has one "/" and is dataset repository on the HF hub without a python file
         #   -> use a packaged module (csv, text etc.) based on content of the repository
+        if trust_remote_code:
+            logger.warning(f'Use trust_remote_code=True. Will invoke codes from {path}. Please make sure '
+                           'that you can trust the external codes.'
+                           )
 
         # Try packaged
         if path in _PACKAGED_DATASETS_MODULES:
diff --git a/modelscope/pipelines/accelerate/vllm.py b/modelscope/pipelines/accelerate/vllm.py
index 15ced4bb..9cadb979 100644
--- a/modelscope/pipelines/accelerate/vllm.py
+++ b/modelscope/pipelines/accelerate/vllm.py
@@ -1,8 +1,11 @@
 from typing import List, Union
 
+from modelscope import get_logger
 from modelscope.pipelines.accelerate.base import InferFramework
 from modelscope.utils.import_utils import is_vllm_available
 
+logger = get_logger()
+
 
 class Vllm(InferFramework):
 
@@ -27,6 +30,9 @@ class Vllm(InferFramework):
         if not Vllm.check_gpu_compatibility(8) and (dtype
                                                     in ('bfloat16', 'auto')):
             dtype = 'float16'
+        logger.warning(
+            f'Use trust_remote_code=True. Will invoke codes from {self.model_dir}. Please make '
+            'sure that you can trust the external codes.')
         self.model = LLM(
             self.model_dir,
             dtype=dtype,
diff --git a/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py b/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py
index f19eddff..cdce09a1 100644
--- a/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py
+++ b/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, Union
 
 import torch
 
-from modelscope import AutoModelForCausalLM
+from modelscope import AutoModelForCausalLM, get_logger
 from modelscope.metainfo import Pipelines, Preprocessors
 from modelscope.models.base import Model
 from modelscope.outputs import OutputKeys
@@ -13,6 +13,8 @@ from modelscope.pipelines.multi_modal.visual_question_answering_pipeline import
 from modelscope.preprocessors import Preprocessor, load_image
 from modelscope.utils.constant import Fields, Frameworks, Tasks
 
+logger = get_logger()
+
 
 @PIPELINES.register_module(
     Tasks.visual_question_answering, module_name='ovis-vl')
@@ -35,6 +37,9 @@ class VisionChatPipeline(VisualQuestionAnsweringPipeline):
         torch_dtype = kwargs.get('torch_dtype', torch.float16)
         multimodal_max_length = kwargs.get('multimodal_max_length', 8192)
         self.device = 'cuda' if device == 'gpu' else device
+        logger.warning(
+            f'Use trust_remote_code=True. Will invoke codes from {model}. Please make '
+            'sure that you can trust the external codes.')
         self.model = AutoModelForCausalLM.from_pretrained(
             model,
             torch_dtype=torch_dtype,
diff --git a/modelscope/pipelines/nlp/llm_pipeline.py b/modelscope/pipelines/nlp/llm_pipeline.py
index 7ad0d278..269e8a42 100644
--- a/modelscope/pipelines/nlp/llm_pipeline.py
+++ b/modelscope/pipelines/nlp/llm_pipeline.py
@@ -97,6 +97,9 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin):
             assert base_model is not None, 'Cannot get adapter_cfg.model_id_or_path from configuration.json file.'
             revision = self.cfg.safe_get('adapter_cfg.model_revision',
                                          'master')
+            logger.warning(
+                f'Use trust_remote_code=True. Will invoke codes from {base_model}. Please make sure that you can '
+                'trust the external codes.')
             base_model = Model.from_pretrained(
                 base_model,
                 revision,
@@ -134,6 +137,9 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin):
                     model) else snapshot_download(model)
                 # TODO: Temporary use of AutoModelForCausalLM
                 # Need to be updated into a universal solution
+                logger.warning(
+                    f'Use trust_remote_code=True. Will invoke codes from {model_dir}. Please make sure '
+                    'that you can trust the external codes.')
                 model = AutoModelForCausalLM.from_pretrained(
                     model_dir,
                     device_map=self.device_map,
@@ -173,6 +179,9 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin):
         self.llm_framework = llm_framework
 
         if os.path.exists(kwargs['model']):
+            logger.warning(
+                f'Use trust_remote_code=True. Will invoke codes from {kwargs["model"]}. Please make sure '
+                'that you can trust the external codes.')
             config = AutoConfig.from_pretrained(
                 kwargs['model'], trust_remote_code=True)
             q_config = config.__dict__.get('quantization_config', None)
@@ -423,6 +432,9 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin):
             model_dir = self.model.model_dir
         if tokenizer_class is None:
             tokenizer_class = AutoTokenizer
+        logger.warning(
+            f'Use trust_remote_code=True. Will invoke codes from {model_dir}. Please make sure '
+            'that you can trust the external codes.')
         return tokenizer_class.from_pretrained(
             model_dir, trust_remote_code=True)
 
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 55eaf809..374381cf 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -269,6 +269,9 @@ class ChatGLM6bV2TextGenerationPipeline(Pipeline):
             if use_bf16:
                 default_torch_dtype = torch.bfloat16
             torch_dtype = kwargs.get('torch_dtype', default_torch_dtype)
+            logger.warning(
+                f'Use trust_remote_code=True. Will invoke codes from {model_dir}. Please make sure '
+                'that you can trust the external codes.')
             model = Model.from_pretrained(
                 model_dir,
                 trust_remote_code=True,
@@ -285,6 +288,9 @@ class ChatGLM6bV2TextGenerationPipeline(Pipeline):
 
         self.model = model
         self.model.eval()
+        logger.warning(
+            f'Use trust_remote_code=True. Will invoke codes from {self.model.model_dir}. Please '
+            'make sure that you can trust the external codes.')
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model.model_dir, trust_remote_code=True)
 
@@ -328,6 +334,9 @@ class QWenChatPipeline(Pipeline):
             bf16 = False
 
         if isinstance(model, str):
+            logger.warning(
+                f'Use trust_remote_code=True. Will invoke codes from {model}. Please make sure '
+                'that you can trust the external codes.')
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model, revision=revision, trust_remote_code=True)
             self.model = AutoModelForCausalLM.from_pretrained(
@@ -392,6 +401,9 @@ class QWenTextGenerationPipeline(Pipeline):
             bf16 = False
 
         if isinstance(model, str):
+            logger.warning(
+                f'Use trust_remote_code=True. Will invoke codes from {model}. Please make sure '
+                'that you can trust the external codes.')
             self.model = AutoModelForCausalLM.from_pretrained(
                 model,
                 device_map=device_map,
diff --git a/modelscope/preprocessors/templates/loader.py b/modelscope/preprocessors/templates/loader.py
index 8943f25d..8dac8d41 100644
--- a/modelscope/preprocessors/templates/loader.py
+++ b/modelscope/preprocessors/templates/loader.py
@@ -820,6 +820,9 @@ class TemplateLoader:
                                 model_id,
                                 revision=kwargs.pop('revision', 'master'),
                                 ignore_file_pattern=ignore_file_pattern)
+                            logger.warning(f'Use trust_remote_code=True. Will invoke codes from {model_dir}.'
+                                           ' Please make sure that you can trust the external codes.'
+                                           )
                             tokenizer = AutoTokenizer.from_pretrained(
                                 model_dir, trust_remote_code=True)
                             config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
diff --git a/modelscope/utils/automodel_utils.py b/modelscope/utils/automodel_utils.py
index eb4aa6c8..9bf1459e 100644
--- a/modelscope/utils/automodel_utils.py
+++ b/modelscope/utils/automodel_utils.py
@@ -3,12 +3,15 @@ import os
 from types import MethodType
 from typing import Any, Optional
 
+from modelscope import get_logger
 from modelscope.metainfo import Tasks
 from modelscope.utils.ast_utils import INDEX_KEY
 from modelscope.utils.import_utils import (LazyImportModule,
                                            is_torch_available,
                                            is_transformers_available)
 
+logger = get_logger()
+
 
 def can_load_by_ms(model_dir: str, task_name: Optional[str],
                    model_type: Optional[str]) -> bool:
@@ -91,6 +94,9 @@ def get_hf_automodel_class(model_dir: str,
     if not os.path.exists(config_path):
         return None
     try:
+        logger.warning(
+            f'Use trust_remote_code=True. Will invoke codes from {model_dir}. Please make sure '
+            'that you can trust the external codes.')
         config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
         if task_name is None:
             automodel_class = get_default_automodel(config)
diff --git a/modelscope/utils/plugins.py b/modelscope/utils/plugins.py
index 1f191a8d..671b1ea6 100644
--- a/modelscope/utils/plugins.py
+++ b/modelscope/utils/plugins.py
@@ -451,6 +451,9 @@ def register_plugins_repo(plugins: List[str]) -> None:
 def register_modelhub_repo(model_dir, allow_remote=False) -> None:
     """ Try to install and import remote model from modelhub"""
     if allow_remote:
+        logger.warning(
+            f'Use allow_remote=True. Will invoke codes from {model_dir}. Please make sure '
+            'that you can trust the external codes.')
         try:
             import_module_from_model_dir(model_dir)
         except KeyError:

From 9aa661118c1fb1aee1f2b125a6311cf3937edb6c Mon Sep 17 00:00:00 2001
From: suluyana <110878454+suluyana@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:42:22 +0800
Subject: [PATCH 4/9] feat: all other ollama models (#1174)

* add cases

* new models

---------

Co-authored-by: suluyan <suluyan.sly@alibaba-inc.com>
---
 modelscope/preprocessors/templates/loader.py | 120 +++++++++++++++----
 tests/tools/test_to_ollama.py                |  54 +++++++++
 2 files changed, 153 insertions(+), 21 deletions(-)

diff --git a/modelscope/preprocessors/templates/loader.py b/modelscope/preprocessors/templates/loader.py
index 8dac8d41..ae7460d1 100644
--- a/modelscope/preprocessors/templates/loader.py
+++ b/modelscope/preprocessors/templates/loader.py
@@ -230,6 +230,10 @@ template_info = [
         modelfile_prefix=
         'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/dolphin-mistral',
     ),
+    TemplateInfo(
+        template_regex=f'.*{cases("dolphin3", "dolphin-3")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/dolphin3'),
 
     # "phi"
     TemplateInfo(
@@ -251,6 +255,12 @@ template_info = [
         modelfile_prefix=
         'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/phi3',
     ),
+    TemplateInfo(
+        template_regex=
+        f'.*{cases("phi4", "phi-4")}{no_multi_modal()}.*',
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/phi4',
+    ),
     TemplateInfo(
         template_regex=
         f'.*{cases("phi")}{no_multi_modal()}.*',
@@ -591,7 +601,7 @@ template_info = [
         template_regex=
         f'.*{cases("deepseek")}.*{cases("v2")}{no("v2.5")}{no_multi_modal()}.*{chat_suffix}.*',
         modelfile_prefix=
-        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/deepseek_v2',
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/deepseek-v2',
     ),
 
     # deepseek_coder
@@ -623,6 +633,94 @@ template_info = [
         template=TemplateType.telechat_v2,
         template_regex=f'.*{cases("TeleChat")}.*{cases("v2")}.*'),
 
+    # tulu3
+    TemplateInfo(
+        template_regex=f'.*{cases("tulu3", "tulu-3")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/tulu3'),
+
+    # athene-v2
+    TemplateInfo(
+        template_regex=f'.*{cases("athene-v2")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/athene-v2'),
+
+    # granite
+    TemplateInfo(
+        template_regex=f'.*{cases("granite-guardian-3")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite3-guardian'),
+    TemplateInfo(
+        template_regex=f'.*{cases("granite")}.*{cases("code")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite-code'),
+    TemplateInfo(
+        template_regex=f'.*{cases("granite-3.1")}.*{cases("2b", "8b")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite3.1-dense'),
+    TemplateInfo(
+        template_regex=f'.*{cases("granite-3.1")}.*{cases("1b", "3b")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite3.1-moe'),
+    TemplateInfo(
+        template_regex=f'.*{cases("granite-embedding")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite-embedding'),
+    TemplateInfo(
+        template_regex=f'.*{cases("granite-3")}.*{cases("2b", "8b")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite3-dense'),
+    TemplateInfo(
+        template_regex=f'.*{cases("granite-3")}.*{cases("1b", "3b")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite3-moe'),
+
+    # opencoder
+    TemplateInfo(
+        template_regex=f'.*{cases("opencoder")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/opencoder'),
+
+    # smollm
+    TemplateInfo(
+        template_regex=f'.*{cases("smollm2")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/smollm2'),
+    TemplateInfo(
+        template_regex=f'.*{cases("smollm")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/smollm'),
+
+    # 'aya'
+    TemplateInfo(
+        template_regex=f'.*{cases("aya-expanse")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/aya-expanse'),
+    TemplateInfo(
+        template_regex=f'.*{cases("aya")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/aya'),
+
+    # falcon
+    TemplateInfo(
+        template_regex=f'.*{cases("falcon3")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/falcon3'),
+    TemplateInfo(
+        template_regex=f'.*{cases("falcon")}.*{cases("-2")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/falcon2'),
+    TemplateInfo(
+        template_regex=f'.*{cases("falcon")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/falcon'),
+
+    # smallthinker
+    TemplateInfo(
+        template_regex=f'.*{cases("smallthinker")}.*', 
+        modelfile_prefix=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/smallthinker'),
+
     TemplateInfo(
         template_regex=f'.*{cases("nomic-embed-text")}.*', 
         modelfile_prefix=
@@ -651,10 +749,6 @@ template_info = [
         template_regex=f'.*{cases("starcoder")}.*', 
         modelfile_prefix=
         'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/starcoder'),
-    TemplateInfo(
-        template_regex=f'.*{cases("granite")}.*{cases("code")}.*', 
-        modelfile_prefix=
-        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite-code'),
     TemplateInfo(
         template_regex=f'.*{cases("all-minilm")}.*', 
         modelfile_prefix=
@@ -663,10 +757,6 @@ template_info = [
         template_regex=f'.*{cases("openchat")}.*', 
         modelfile_prefix=
         'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/openchat'),
-    TemplateInfo(
-        template_regex=f'.*{cases("aya")}.*', 
-        modelfile_prefix=
-        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/aya'),
     TemplateInfo(
         template_regex=f'.*{cases("openhermes")}.*', 
         modelfile_prefix=
@@ -687,10 +777,6 @@ template_info = [
         template_regex=f'.*{cases("xwin")}.*{cases("lm")}.*', 
         modelfile_prefix=
         'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/xwinlm'),
-    TemplateInfo(
-        template_regex=f'.*{cases("smollm")}.*', 
-        modelfile_prefix=
-        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/smollm'),
     TemplateInfo(
         template_regex=f'.*{cases("sqlcoder")}.*', 
         modelfile_prefix=
@@ -699,14 +785,6 @@ template_info = [
         template_regex=f'.*{cases("starling-lm")}.*', 
         modelfile_prefix=
         'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/starling-lm'),
-    TemplateInfo(
-        template_regex=f'.*{cases("falcon")}.*{cases("-2")}.*', 
-        modelfile_prefix=
-        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/falcon2'),
-    TemplateInfo(
-        template_regex=f'.*{cases("falcon")}.*', 
-        modelfile_prefix=
-        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/falcon'),
     TemplateInfo(
         template_regex=f'.*{cases("solar-pro")}.*', 
         modelfile_prefix=
diff --git a/tests/tools/test_to_ollama.py b/tests/tools/test_to_ollama.py
index 78b885eb..3ef53727 100644
--- a/tests/tools/test_to_ollama.py
+++ b/tests/tools/test_to_ollama.py
@@ -311,6 +311,60 @@ class TestToOllama(unittest.TestCase):
                               'llama3.3')
         _test_check_tmpl_type('bartowski/EXAONE-3.5-7.8B-Instruct-GGUF',
                               'exaone3.5')
+        _test_check_tmpl_type(
+            'QuantFactory/Tulu-3.1-8B-SuperNova-Smart-GGUF',
+            'tulu3',
+            gguf_meta={'general.name': 'Tulu 3.1 8B SuperNova'})
+        _test_check_tmpl_type(
+            'bartowski/Athene-V2-Chat-GGUF',
+            'athene-v2',
+            gguf_meta={'general.name': 'Athene V2 Chat'})
+        _test_check_tmpl_type(
+            'QuantFactory/granite-guardian-3.0-2b-GGUF',
+            'granite3-guardian',
+            gguf_meta={'general.name': 'Models'})
+        _test_check_tmpl_type('lmstudio-community/OpenCoder-8B-Instruct-GGUF',
+                              'opencoder')
+        _test_check_tmpl_type(
+            'QuantFactory/SmolLM2-1.7B-Instruct-GGUF',
+            'smollm2',
+            gguf_meta={'general.name': 'Smollm2 1.7B 8k Mix7 Ep2 v2'})
+        _test_check_tmpl_type(
+            'prithivMLmods/Aya-Expanse-8B-GGUF',
+            'aya-expanse',
+            gguf_meta={'general.name': 'Aya Expanse 8b'})
+        _test_check_tmpl_type('lmstudio-community/Falcon3-7B-Instruct-GGUF',
+                              'falcon3')
+        _test_check_tmpl_type(
+            'lmstudio-community/granite-3.1-8b-instruct-GGUF',
+            'granite3.1-dense',
+            gguf_meta={'general.name': 'Granite 3.1 8b Instruct'})
+        _test_check_tmpl_type(
+            'lmstudio-community/granite-3.1-2b-instruct-GGUF',
+            'granite3.1-dense',
+            gguf_meta={'general.name': 'Granite 3.1 2b Instruct'})
+        _test_check_tmpl_type(
+            'lmstudio-community/granite-embedding-278m-multilingual-GGUF',
+            'granite-embedding',
+            gguf_meta={'general.name': 'Granite Embedding 278m Multilingual'})
+        _test_check_tmpl_type(
+            'QuantFactory/granite-3.1-3b-a800m-instruct-GGUF',
+            'granite3.1-moe',
+            gguf_meta={'general.name': 'Granite 3.1 3b A800M Base'})
+        _test_check_tmpl_type(
+            'bartowski/granite-3.1-1b-a400m-instruct-GGUF',
+            'granite3.1-moe',
+            gguf_meta={'general.name': 'Granite 3.1 1b A400M Instruct'})
+        _test_check_tmpl_type(
+            'bartowski/SmallThinker-3B-Preview-GGUF',
+            'smallthinker',
+            gguf_meta={'general.name': 'SmallThinker 3B Preview'})
+        _test_check_tmpl_type(
+            'bartowski/Dolphin3.0-Llama3.1-8B-GGUF',
+            'dolphin3',
+            gguf_meta={'general.name': 'Dolphin 3.0 Llama 3.1 8B'})
+        _test_check_tmpl_type(
+            'AI-ModelScope/phi-4', 'phi4', gguf_meta={'general.name': 'Phi 4'})
 
 
 if __name__ == '__main__':

From 26812dc028d284bdabc2ac6acc9b782df9e0f970 Mon Sep 17 00:00:00 2001
From: Yunlin Mao <mao.looper@qq.com>
Date: Fri, 10 Jan 2025 17:56:05 +0800
Subject: [PATCH 5/9] Unify datasets cache dir (#1178)

* fix cache

* fix lint

* fix dataset cache

* fix lint

* remove
---
 modelscope/utils/config_ds.py  | 9 +++++----
 modelscope/utils/file_utils.py | 7 ++++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/modelscope/utils/config_ds.py b/modelscope/utils/config_ds.py
index 72a25887..eae85888 100644
--- a/modelscope/utils/config_ds.py
+++ b/modelscope/utils/config_ds.py
@@ -5,13 +5,14 @@ from pathlib import Path
 
 # Cache location
 from modelscope.hub.constants import DEFAULT_MODELSCOPE_DATA_ENDPOINT
-from modelscope.utils.file_utils import get_modelscope_cache_dir
+from modelscope.utils.file_utils import (get_dataset_cache_root,
+                                         get_modelscope_cache_dir)
 
 MS_CACHE_HOME = get_modelscope_cache_dir()
 
-DEFAULT_MS_DATASETS_CACHE = os.path.join(MS_CACHE_HOME, 'hub', 'datasets')
-MS_DATASETS_CACHE = Path(
-    os.getenv('MS_DATASETS_CACHE', DEFAULT_MS_DATASETS_CACHE))
+# NOTE: the `MS_DATASETS_CACHE` env var is no longer read; the dataset cache
+# root follows `MODELSCOPE_CACHE` (default `~/.cache/modelscope/hub/datasets`)
+MS_DATASETS_CACHE = get_dataset_cache_root()
 
 DOWNLOADED_DATASETS_DIR = 'downloads'
 DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
diff --git a/modelscope/utils/file_utils.py b/modelscope/utils/file_utils.py
index c00e8d26..50f66e34 100644
--- a/modelscope/utils/file_utils.py
+++ b/modelscope/utils/file_utils.py
@@ -60,11 +60,16 @@ def get_model_cache_root() -> str:
 
 def get_dataset_cache_root() -> str:
     """Get dataset raw file cache root path.
+    If `MODELSCOPE_CACHE` is set, return `MODELSCOPE_CACHE/datasets`;
+    otherwise return `~/.cache/modelscope/hub/datasets`.
 
     Returns:
         str: the modelscope dataset raw file cache root.
     """
-    return os.path.join(get_modelscope_cache_dir(), 'datasets')
+    if os.getenv('MODELSCOPE_CACHE'):
+        return os.path.join(get_modelscope_cache_dir(), 'datasets')
+    else:
+        return os.path.join(get_modelscope_cache_dir(), 'hub', 'datasets')
 
 
 def get_dataset_cache_dir(dataset_id: str) -> str:

From 8f3358fe50058f752deb0e56e5d0c53082c2aa7e Mon Sep 17 00:00:00 2001
From: "Xingjun.Wang" <wangxingjun778@163.com>
Date: Fri, 10 Jan 2025 23:38:46 +0800
Subject: [PATCH 6/9] Add repo_id and repo_type in snapshot_download (#1172)

* add repo_id and repo_type in snapshot_download

* fix positional args

* update
---
 modelscope/hub/snapshot_download.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py
index 2f7f4790..31d1f091 100644
--- a/modelscope/hub/snapshot_download.py
+++ b/modelscope/hub/snapshot_download.py
@@ -19,6 +19,7 @@ from modelscope.hub.utils.utils import (get_model_masked_directory,
                                         model_id_to_group_owner_name)
 from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        DEFAULT_MODEL_REVISION,
+                                       DEFAULT_REPOSITORY_REVISION,
                                        REPO_TYPE_DATASET, REPO_TYPE_MODEL,
                                        REPO_TYPE_SUPPORT)
 from modelscope.utils.logger import get_logger
@@ -28,8 +29,8 @@ logger = get_logger()
 
 
 def snapshot_download(
-    model_id: str,
-    revision: Optional[str] = DEFAULT_MODEL_REVISION,
+    model_id: str = None,
+    revision: Optional[str] = None,
     cache_dir: Union[str, Path, None] = None,
     user_agent: Optional[Union[Dict, str]] = None,
     local_files_only: Optional[bool] = False,
@@ -40,6 +41,8 @@ def snapshot_download(
     allow_patterns: Optional[Union[List[str], str]] = None,
     ignore_patterns: Optional[Union[List[str], str]] = None,
     max_workers: int = 8,
+    repo_id: str = None,
+    repo_type: Optional[str] = REPO_TYPE_MODEL,
 ) -> str:
     """Download all files of a repo.
     Downloads a whole snapshot of a repo's files at the specified revision. This
@@ -51,7 +54,10 @@ def snapshot_download(
     user always has git and git-lfs installed, and properly configured.
 
     Args:
-        model_id (str): A user or an organization name and a repo name separated by a `/`.
+        repo_id (str): A user or an organization name and a repo name separated by a `/`.
+        model_id (str): A user or an organization name and a model name separated by a `/`.
+            if `repo_id` is provided, `model_id` will be ignored.
+        repo_type (str, optional): The type of the repo, either 'model' or 'dataset'.
         revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a
             commit hash. NOTE: currently only branch and tag name is supported
         cache_dir (str, Path, optional): Path to the folder where cached files are stored, model will
@@ -87,9 +93,22 @@ def snapshot_download(
         - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
         if some parameter value is invalid
     """
+
+    repo_id = repo_id or model_id
+    if not repo_id:
+        raise ValueError('Please provide a valid model_id or repo_id')
+
+    if repo_type not in REPO_TYPE_SUPPORT:
+        raise ValueError(
+            f'Invalid repo type: {repo_type}, only support: {REPO_TYPE_SUPPORT}'
+        )
+
+    if revision is None:
+        revision = DEFAULT_DATASET_REVISION if repo_type == REPO_TYPE_DATASET else DEFAULT_MODEL_REVISION
+
     return _snapshot_download(
-        model_id,
-        repo_type=REPO_TYPE_MODEL,
+        repo_id,
+        repo_type=repo_type,
         revision=revision,
         cache_dir=cache_dir,
         user_agent=user_agent,

From 845a0bd5fef424ae6b30f79500e546822bd418a1 Mon Sep 17 00:00:00 2001
From: suluyana <110878454+suluyana@users.noreply.github.com>
Date: Mon, 13 Jan 2025 10:05:42 +0800
Subject: [PATCH 7/9] Fix/text gen (#1177)

* fix text-gen: read pipeline type from configuration.json first

---------

Co-authored-by: suluyan <suluyan.sly@alibaba-inc.com>
---
 modelscope/pipelines/builder.py | 70 +++++++++++++++++++--------------
 modelscope/utils/hub.py         |  2 +
 2 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 79459161..596d6d22 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -108,30 +108,7 @@ def pipeline(task: str = None,
     """
     if task is None and pipeline_name is None:
         raise ValueError('task or pipeline_name is required')
-    prefer_llm_pipeline = kwargs.get('external_engine_for_llm')
-    if task is not None and task.lower() in [
-            Tasks.text_generation, Tasks.chat
-    ]:
-        # if not specified, prefer llm pipeline for aforementioned tasks
-        if prefer_llm_pipeline is None:
-            prefer_llm_pipeline = True
-    # for llm pipeline, if llm_framework is not specified, default to swift instead
-    # TODO: port the swift infer based on transformer into ModelScope
-    if prefer_llm_pipeline and kwargs.get('llm_framework') is None:
-        kwargs['llm_framework'] = 'swift'
-    third_party = kwargs.get(ThirdParty.KEY)
-    if third_party is not None:
-        kwargs.pop(ThirdParty.KEY)
-    if pipeline_name is None and prefer_llm_pipeline:
-        pipeline_name = external_engine_for_llm_checker(
-            model, model_revision, kwargs)
-    if pipeline_name is None:
-        model = normalize_model_input(
-            model,
-            model_revision,
-            third_party=third_party,
-            ignore_file_pattern=ignore_file_pattern)
-    pipeline_props = {'type': pipeline_name}
+
     if pipeline_name is None:
         # get default pipeline for this task
         if isinstance(model, str) \
@@ -142,16 +119,47 @@ def pipeline(task: str = None,
                     model, revision=model_revision) if isinstance(
                         model, str) else read_config(
                             model[0], revision=model_revision)
-                register_plugins_repo(cfg.safe_get('plugins'))
-                register_modelhub_repo(model, cfg.get('allow_remote', False))
-                pipeline_name = external_engine_for_llm_checker(
-                    model, model_revision,
-                    kwargs) if prefer_llm_pipeline else None
-                if pipeline_name is not None:
+                if cfg:
+                    pipeline_name = cfg.safe_get('pipeline',
+                                                 {}).get('type', None)
+
+                if pipeline_name is None:
+                    prefer_llm_pipeline = kwargs.get('external_engine_for_llm')
+                    # if specified in neither args nor configuration.json, prefer llm pipeline for the tasks below
+                    if task is not None and task.lower() in [
+                            Tasks.text_generation, Tasks.chat
+                    ]:
+                        if prefer_llm_pipeline is None:
+                            prefer_llm_pipeline = True
+                    # for llm pipeline, if llm_framework is not specified, default to swift instead
+                    # TODO: port the swift infer based on transformer into ModelScope
+                    if prefer_llm_pipeline:
+                        if kwargs.get('llm_framework') is None:
+                            kwargs['llm_framework'] = 'swift'
+                        pipeline_name = external_engine_for_llm_checker(
+                            model, model_revision, kwargs)
+
+                if pipeline_name is None or pipeline_name != 'llm':
+                    third_party = kwargs.get(ThirdParty.KEY)
+                    if third_party is not None:
+                        kwargs.pop(ThirdParty.KEY)
+
+                    model = normalize_model_input(
+                        model,
+                        model_revision,
+                        third_party=third_party,
+                        ignore_file_pattern=ignore_file_pattern)
+
+                    register_plugins_repo(cfg.safe_get('plugins'))
+                    register_modelhub_repo(model,
+                                           cfg.get('allow_remote', False))
+
+                if pipeline_name:
                     pipeline_props = {'type': pipeline_name}
                 else:
                     check_config(cfg)
                     pipeline_props = cfg.pipeline
+
         elif model is not None:
             # get pipeline info from Model object
             first_model = model[0] if isinstance(model, list) else model
@@ -165,6 +173,8 @@ def pipeline(task: str = None,
             pipeline_name, default_model_repo = get_default_pipeline_info(task)
             model = normalize_model_input(default_model_repo, model_revision)
             pipeline_props = {'type': pipeline_name}
+    else:
+        pipeline_props = {'type': pipeline_name}
 
     pipeline_props['model'] = model
     pipeline_props['device'] = device
diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py
index 39ca644a..79df8545 100644
--- a/modelscope/utils/hub.py
+++ b/modelscope/utils/hub.py
@@ -54,6 +54,8 @@ def read_config(model_id_or_path: str,
         local_path = os.path.join(model_id_or_path, ModelFile.CONFIGURATION)
     elif os.path.isfile(model_id_or_path):
         local_path = model_id_or_path
+    else:
+        return None
 
     return Config.from_file(local_path)
 

From bceb7196d1bb1b09a86149fd97f95bea49c07e7c Mon Sep 17 00:00:00 2001
From: Yunlin Mao <mao.looper@qq.com>
Date: Mon, 13 Jan 2025 21:03:59 +0800
Subject: [PATCH 8/9] update doc with llama_index (#1180)

---
 ...rch_QA_based_on_langchain_llamaindex.ipynb | 342 +++++++++++++++++-
 ...en_doc_search_QA_based_on_llamaindex.ipynb | 126 +++++--
 2 files changed, 422 insertions(+), 46 deletions(-)

diff --git a/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb b/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb
index e6ddabfd..a66c079d 100644
--- a/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb
+++ b/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb
@@ -2,6 +2,10 @@
  "cells": [
   {
    "cell_type": "markdown",
+   "id": "8230365523c9330a",
+   "metadata": {
+    "collapsed": false
+   },
    "source": [
     "# Usage\n",
     "1. Install python dependencies\n",
@@ -30,11 +34,7 @@
     "``` \n",
     "\n",
     "3. Enjoy your QA AI"
-   ],
-   "metadata": {
-    "collapsed": false
-   },
-   "id": "8230365523c9330a"
+   ]
   },
   {
    "cell_type": "code",
@@ -94,7 +94,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "1cb8feca-c71f-4ad6-8eff-caae95411aa0",
    "metadata": {
     "ExecutionIndicator": {
@@ -109,7 +109,312 @@
     },
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading Model to directory: /mnt/workspace/.cache/modelscope/Qwen/Qwen-1_8B-Chat\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading Model to directory: /mnt/workspace/.cache/modelscope/Qwen/Qwen-1_8B-Chat\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Try importing flash-attention for faster inference...\n",
+      "Warning: import flash_attn rotary fail, please install FlashAttention rotary to get higher efficiency https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary\n",
+      "Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get higher efficiency https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm\n",
+      "Warning: import flash_attn fail, please install FlashAttention to get higher efficiency https://github.com/Dao-AILab/flash-attention\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "114cb3d66e9e4f6694ba66c91fc4b8a9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-01-13 15:44:50,172 - modelscope.ast - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "STEP1: qianwen LLM created\n",
+      "STEP2: reading docs ...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-01-13 15:44:50,565 - modelscope.ast - INFO - Updating the files for the changes of local files, first time updating will take longer time! Please wait till updating done!\n",
+      "2025-01-13 15:44:50,588 - modelscope.ast - INFO - AST-Scanning the path \"/mnt/data/data/user/maoyunlin.myl/modelscope/package/modelscope\" with the following sub folders ['models', 'metrics', 'pipelines', 'preprocessors', 'trainers', 'msdatasets', 'exporters']\n",
+      "2025-01-13 15:44:58,393 - modelscope.ast - INFO - Scanning done! A number of 987 components indexed or updated! Time consumed 7.80418848991394s\n",
+      "2025-01-13 15:44:58,770 - modelscope.ast - INFO - Loading done! Current index file version is 2.0.0, with md5 78f3257e373ec3a7089c4256261fb13f and a total number of 987 components indexed\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading Model to directory: /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-small\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-01-13 15:44:59,577 - modelscope - INFO - Model revision not specified, using default: [master] version.\n",
+      "2025-01-13 15:44:59,812 - modelscope - INFO - Got 9 files, start to download ...\n",
+      "Processing 9 items:   0%|          | 0/9 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "506bd9e27c154fe4aca6fe3bc364cb52",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [config.json]:   0%|          | 0.00/772 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cc7af37066694704bb07c85f5d9bcd6c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [tokenizer.json]:   0%|          | 0.00/291k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "042eed07f85b420bb8d908d3a9b001a4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [pytorch_model.bin]:   0%|          | 0.00/57.7M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e20c9b7d9a0246f0bedaa75606e7094e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [configuration.json]:   0%|          | 0.00/2.02k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a3a7ec13561049d997d86bbe3fb9143e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [special_tokens_map.json]:   0%|          | 0.00/125 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "496e78a215c54e21b334157c700c1522",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [README.md]:   0%|          | 0.00/14.0k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5e9646c5f077462e9db0900939e9359c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [resources/dual-encoder.png]:   0%|          | 0.00/60.7k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "91a04e5faaef45fba21cd3644866fcc8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [tokenizer_config.json]:   0%|          | 0.00/425 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing 9 items:  11%|█         | 1/9 [00:00<00:02,  3.73it/s]"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1668295ed6c141d88e6b0dddc66108d7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading [vocab.txt]:   0%|          | 0.00/68.4k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing 9 items: 100%|██████████| 9/9 [00:03<00:00,  2.71it/s]\n",
+      "2025-01-13 15:45:03,131 - modelscope - INFO - Download model 'damo/nlp_gte_sentence-embedding_chinese-small' successfully.\n",
+      "2025-01-13 15:45:03,158 - modelscope - INFO - initiate model from /mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-small\n",
+      "2025-01-13 15:45:03,159 - modelscope - INFO - initiate model from location /mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-small.\n",
+      "2025-01-13 15:45:03,162 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-small\n",
+      "/root/miniconda3/envs/modelscope_1.21/lib/python3.9/site-packages/transformers/modeling_utils.py:488: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+      "  return torch.load(checkpoint_file, map_location=map_location)\n",
+      "2025-01-13 15:45:04,093 - modelscope - WARNING - No preprocessor field found in cfg.\n",
+      "2025-01-13 15:45:04,094 - modelscope - WARNING - No val key and type key found in preprocessor domain of configuration.json file.\n",
+      "2025-01-13 15:45:04,095 - modelscope - WARNING - Cannot find available config to build preprocessor at mode inference, current config: {'model_dir': '/mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-small'}. trying to build by task and model information.\n",
+      "2025-01-13 15:45:04,136 - modelscope - WARNING - No preprocessor field found in cfg.\n",
+      "2025-01-13 15:45:04,137 - modelscope - WARNING - No val key and type key found in preprocessor domain of configuration.json file.\n",
+      "2025-01-13 15:45:04,137 - modelscope - WARNING - Cannot find available config to build preprocessor at mode inference, current config: {'model_dir': '/mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-small', 'sequence_length': 128}. trying to build by task and model information.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LLM is explicitly disabled. Using MockLLM.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/miniconda3/envs/modelscope_1.21/lib/python3.9/site-packages/transformers/modeling_utils.py:909: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " 2.2 reading doc done, vec db created.\n",
+      "STEP3: chat prompt template created.\n",
+      "@@@ query= 西安交大的校训是什么？\n",
+      "@@@@ source= Node ID: 1392eeb8-1e5f-48fe-a31a-213e120fb5ba\n",
+      "Text: 西安交通大学是我国最早兴办、享誉海内外的著名高等学府，是教育部直属重点大学。西迁以来，一代代交大人扎根西部、服务国家，为西部发展\n",
+      "和国家建设作出了卓越贡献，以实际行动铸就了第一批纳入中国共产党人精神谱系的西迁精神。2017年12月，习近平总书记对学校15位老教授来信作出\n",
+      "重要指示。在2018年新年贺词中，习近平总书记再次提到“西安交大西迁的老教授”。2020年4月22日，习近平总书记来校考察并发表重要讲话，强\n",
+      "调西迁精神的核心是爱国主义，精髓是听党指挥跟党走，与党和国家、与民族和人民同呼吸、共命运，勉励师生在新时代创造属于我们这代人的历史功绩，给全\n",
+      "校师生以巨大关怀和极大鼓舞，为学校新时代建设中国特色世界一流大学提供了根本遵循和行动指南。\n",
+      "十九世纪末，甲午战败，民族危难。近代著名实业家、教育...\n",
+      "Score:  0.916\n",
+      "\n",
+      "@@@@ source= Node ID: aa992526-59f5-4bb9-95b9-43291b821ce4\n",
+      "Text: 2000年国务院决定将西安交通大学、西安医科大学、陕西财经学院三校合并，组建新的西安交通大学。\n",
+      "学校是“七五”“八五”重点建设单位，首批进入国家“211”和“985”工程建设学校。2017 年入选国家一流大学建设名单 A\n",
+      "类建设高校，2022 年入选国家第二轮“双一流”建设高校，8 个学科入选“双一流”建设学科。据 ESI 公布的数据，截至 2023 年 5\n",
+      "月，学校 17 个学科进入世界学术机构前 1%，5 个学科进入前 1‰，其中工程学进入前万分之一。  学校是涵盖理、工、医、经、管、文、法、\n",
+      "哲、艺、教育、交叉等11个学科门类的综合性研究型大学，设有32个学院（部、中心）、9个本科书院和3所直属附属医院。现有在编教工6635人，其\n",
+      "中专任教师3789人。师资队伍中入选院士、杰青等国...\n",
+      "Score:  0.874\n",
+      "\n",
+      "Human: 请基于```内的内容回答问题。\"\n",
+      "```\n",
+      "[Document(metadata={'file_path': '/mnt/workspace/custom_data/xianjiaoda.md', 'file_name': 'xianjiaoda.md', 'file_type': 'text/markdown', 'file_size': 13228, 'creation_date': '2025-01-13', 'last_modified_date': '2024-01-16'}, page_content='西安交通大学是我国最早兴办、享誉海内外的著名高等学府，是教育部直属重点大学。西迁以来，一代代交大人扎根西部、服务国家，为西部发展和国家建设作出了卓越贡献，以实际行动铸就了第一批纳入中国共产党人精神谱系的西迁精神。2017年12月，习近平总书记对学校15位老教授来信作出重要指示。在2018年新年贺词中，习近平总书记再次提到“西安交大西迁的老教授”。2020年4月22日，习近平总书记来校考察并发表重要讲话，强调西迁精神的核心是爱国主义，精髓是听党指挥跟党走，与党和国家、与民族和人民同呼吸、共命运，勉励师生在新时代创造属于我们这代人的历史功绩，给全校师生以巨大关怀和极大鼓舞，为学校新时代建设中国特色世界一流大学提供了根本遵循和行动指南。\\n\\n十九世纪末，甲午战败，民族危难。近代著名实业家、教育家盛宣怀秉持“自强首在储才，储才必先兴学”的信念，于1896年在上海创建了南洋公学，1921年定名为交通大学。学校坚持“求实学、务实业”办学宗旨，强调“修一等品行、求一等学问、创一等事业、成一等人才”办学目标。至二十世纪二三十年代，成为独具“理工管”特色的著名大学。抗战时期，学校移至租界，内迁重庆，坚持沪渝两地办学，为抵御外侮，不少学生投笔从戎，浴血沙场。解放前夕，师生积极投入民主革命和解放斗争，学校被誉为“民主堡垒”。\\n1955年中央决定交通大学内迁西安；1956年起师生分批迁赴西安；1957年分设为交通大学西安、上海两个部分，实行统一领导；1959年，交通大学西安部分定名为西安交通大学，同年被列为全国十六所重点大学之一。2000年国务院决定将西安交通大学、西安医科大学、陕西财经学院三校合并，组建新的西安交通大学。\\n\\n学校是“七五”“八五”重点建设单位，首批进入国家“211”和“985”工程建设学校。2017 年入选国家一流大学建设名单 A 类建设高校，2022 年入选国家第二轮“双一流”建设高校，8 个学科入选“双一流”建设学科。'), Document(metadata={'file_path': '/mnt/workspace/custom_data/xianjiaoda.md', 'file_name': 'xianjiaoda.md', 'file_type': 'text/markdown', 'file_size': 13228, 'creation_date': '2025-01-13', 'last_modified_date': '2024-01-16'}, page_content='2000年国务院决定将西安交通大学、西安医科大学、陕西财经学院三校合并，组建新的西安交通大学。\\n\\n学校是“七五”“八五”重点建设单位，首批进入国家“211”和“985”工程建设学校。2017 年入选国家一流大学建设名单 A 类建设高校，2022 年入选国家第二轮“双一流”建设高校，8 个学科入选“双一流”建设学科。据 ESI 公布的数据，截至 2023 年 5 月，学校 17 个学科进入世界学术机构前 1%，5 个学科进入前 
1‰，其中工程学进入前万分之一。\\n\\n学校是涵盖理、工、医、经、管、文、法、哲、艺、教育、交叉等11个学科门类的综合性研究型大学，设有32个学院（部、中心）、9个本科书院和3所直属附属医院。现有在编教工6635人，其中专任教师3789人。师资队伍中入选院士、杰青等国家级各类重大人才工程545人次，获评国家级各类创新团队51个，为国家作出突出贡献并享受政府特殊津贴专家450名，国家级教学名师11名。\\n\\n学校现有学生54760名，其中本科生22407名，研究生29285名，留学生3068名；本科招生专业76个、博士学位授权一级学科36个、硕士学位授权一级学科43个、博士专业学位授权点6个、硕士专业学位授权点29个，博士后流动站30个，国家一级重点学科8个、国家二级重点学科8个、国家重点（培育）学科3个，全国（国家）重点实验室8个，国家工程（技术）研究中心10个，国家产教融合创新平台2个，国家国际科技合作基地5个，国家应用数学中心1个，2011协同创新中心1个、其他省部级及以上重点科研基地195个。\\n\\n建校127年来，形成了兴学强国、艰苦创业、崇德尚实、严谨治学的优良传统，起点高、基础厚、要求严、重实践的办学特色，培养出了一大批卓越的政治家、科学家、社会活动家、教育家、企业家、艺术家、医学专家等，如蔡锷、张元济、蔡元培、黄炎培、邵力子、李叔同、邹韬奋、陆定一、钱学森、张光斗、汪道涵、吴文俊、杨嘉墀、徐光宪、姚桐斌、陈能宽、江泽民、侯宗濂、黄旭华、顾诵芬、丁关根、吴自良、蒋新松、蒋正华、王希季、李金华、韩启德等。')]\n",
+      "```\n",
+      "我的问题是：西安交大的校训是什么？。\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/miniconda3/envs/modelscope_1.21/lib/python3.9/site-packages/transformers/modeling_utils.py:909: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([1., 2.], device='cuda:0')\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'\"求实学、务实业\"'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "import os\n",
     "from abc import ABC\n",
@@ -121,14 +426,11 @@
     "from langchain_core.prompts import ChatPromptTemplate\n",
     "from langchain_core.runnables import RunnablePassthrough\n",
     "from modelscope import AutoModelForCausalLM, AutoTokenizer\n",
-    "from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader\n",
-    "from llama_index import ServiceContext\n",
-    "from llama_index.embeddings.base import BaseEmbedding\n",
-    "from llama_index import set_global_service_context\n",
+    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings\n",
+    "from llama_index.core.embeddings import BaseEmbedding\n",
     "from langchain_core.retrievers import BaseRetriever\n",
     "from langchain_core.callbacks import CallbackManagerForRetrieverRun\n",
     "from langchain_core.documents import Document\n",
-    "from llama_index.retrievers import VectorIndexRetriever\n",
     "\n",
     "# configs for LLM\n",
     "llm_name = \"Qwen/Qwen-1_8B-Chat\"\n",
@@ -195,8 +497,8 @@
     "    ) -> List[Document]:\n",
     "        \"\"\"Get documents relevant for a query.\"\"\"\n",
     "        try:\n",
-    "            from llama_index.indices.base import BaseIndex\n",
-    "            from llama_index.response.schema import Response\n",
+    "            from llama_index.core.indices.base import BaseIndex\n",
+    "            from llama_index.core import Response\n",
     "        except ImportError:\n",
     "            raise ImportError(\n",
     "                \"You need to install `pip install llama-index` to use this retriever.\"\n",
@@ -239,9 +541,9 @@
     "\n",
     "# define QianWen LLM based on langchain's LLM to use models in Modelscope\n",
     "class QianWenChatLLM(LLM):\n",
-    "    max_length = 10000\n",
+    "    max_length: int = 10000\n",
     "    temperature: float = 0.01\n",
-    "    top_p = 0.9\n",
+    "    top_p: float = 0.9\n",
     "\n",
     "    def __init__(self):\n",
     "        super().__init__()\n",
@@ -270,11 +572,11 @@
     "# STEP2: load knowledge file and initialize vector db by llamaIndex\n",
     "print('STEP2: reading docs ...')\n",
     "embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)\n",
-    "service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=None)\n",
-    "set_global_service_context(service_context)     # global config, not good\n",
+    "Settings.llm = None\n",
+    "Settings.embed_model=embeddings  # global config, not good\n",
     "\n",
     "llamaIndex_docs = SimpleDirectoryReader(knowledge_doc_file_dir).load_data()\n",
-    "llamaIndex_index = GPTVectorStoreIndex.from_documents(llamaIndex_docs, chunk_size=512)\n",
+    "llamaIndex_index = VectorStoreIndex.from_documents(llamaIndex_docs, chunk_size=512)\n",
     "retriever = LlamaIndexRetriever(index=llamaIndex_index)\n",
     "print(' 2.2 reading doc done, vec db created.')\n",
     "\n",
@@ -318,7 +620,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.9.20"
   }
  },
  "nbformat": 4,
diff --git a/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb b/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb
index 194c46a2..51505e84 100644
--- a/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb
+++ b/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb
@@ -2,6 +2,10 @@
  "cells": [
   {
    "cell_type": "markdown",
+   "id": "f4abc589d9bfffca",
+   "metadata": {
+    "collapsed": false
+   },
    "source": [
     "# Usage\n",
     "\n",
@@ -34,27 +38,29 @@
     "```\n",
     "\n",
     "## 3. Go!"
-   ],
-   "metadata": {
-    "collapsed": false
-   },
-   "id": "f4abc589d9bfffca"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "id": "c32122833dd7b8c8",
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "!pip install modelscope\n",
     "!pip install transformers -U\n",
     "!pip install llama-index llama-index-llms-huggingface ipywidgets "
-   ],
-   "metadata": {
-    "collapsed": false
-   },
-   "id": "c32122833dd7b8c8"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "id": "63704e2b21a9ba52",
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [],
    "source": [
     "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n",
@@ -74,15 +80,90 @@
     "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n",
     "\n",
     "!cd /mnt/workspace"
-   ],
-   "metadata": {
-    "collapsed": false
-   },
-   "id": "63704e2b21a9ba52"
+   ]
   },
   {
    "cell_type": "code",
-   "outputs": [],
+   "execution_count": 2,
+   "id": "eef67659e94045c5",
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading Model to directory: /mnt/workspace/.cache/modelscope/qwen/Qwen1.5-4B-Chat\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-01-13 15:52:53,260 - modelscope - INFO - Model revision not specified, using default: [master] version.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-01-13 15:52:53,637 - modelscope - INFO - Creating symbolic link [/mnt/workspace/.cache/modelscope/qwen/Qwen1.5-4B-Chat].\n",
+      "2025-01-13 15:52:53,638 - modelscope - WARNING - Failed to create symbolic link /mnt/workspace/.cache/modelscope/qwen/Qwen1.5-4B-Chat for /mnt/workspace/.cache/modelscope/qwen/Qwen1___5-4B-Chat.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4523c5dd31ba411d95cc0ce9e5da8ded",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "llm created\n",
+      "Downloading Model to directory: /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-01-13 15:53:01,651 - modelscope - INFO - Model revision not specified, using default: [master] version.\n",
+      "2025-01-13 15:53:01,894 - modelscope - INFO - initiate model from /mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-base\n",
+      "2025-01-13 15:53:01,895 - modelscope - INFO - initiate model from location /mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-base.\n",
+      "2025-01-13 15:53:01,898 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-base\n",
+      "2025-01-13 15:53:02,532 - modelscope - WARNING - No preprocessor field found in cfg.\n",
+      "2025-01-13 15:53:02,533 - modelscope - WARNING - No val key and type key found in preprocessor domain of configuration.json file.\n",
+      "2025-01-13 15:53:02,533 - modelscope - WARNING - Cannot find available config to build preprocessor at mode inference, current config: {'model_dir': '/mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-base'}. trying to build by task and model information.\n",
+      "2025-01-13 15:53:02,588 - modelscope - WARNING - No preprocessor field found in cfg.\n",
+      "2025-01-13 15:53:02,588 - modelscope - WARNING - No val key and type key found in preprocessor domain of configuration.json file.\n",
+      "2025-01-13 15:53:02,589 - modelscope - WARNING - Cannot find available config to build preprocessor at mode inference, current config: {'model_dir': '/mnt/workspace/.cache/modelscope/hub/damo/nlp_gte_sentence-embedding_chinese-base', 'sequence_length': 128}. trying to build by task and model information.\n",
+      "/root/miniconda3/envs/modelscope_1.21/lib/python3.9/site-packages/transformers/modeling_utils.py:1044: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/root/miniconda3/envs/modelscope_1.21/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:628: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.0` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
+      "  warnings.warn(\n",
+      "/root/miniconda3/envs/modelscope_1.21/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:633: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "作为一所历史悠久的综合性研究型大学，西安交通大学有着丰富的校训文化。其中，\"厚德博学，求是创新\"是其最为人所知的校训之一。这句校训不仅体现了学校的教育理念，也反映了学校对学生的期望和要求。此外，西安交通大学还有一句著名的校训：\"明德尚志，自强不息\"，这也是学校对学生的一种激励和引导。这两句校训都强调了教育的重要性，以及学生应该具备的道德品质和自我提升的精神。\n"
+     ]
+    }
+   ],
    "source": [
     "import logging\n",
     "import sys\n",
@@ -93,9 +174,7 @@
     "from llama_index.core import (\n",
     "    SimpleDirectoryReader,\n",
     "    VectorStoreIndex,\n",
-    "    Settings,\n",
-    "    ServiceContext,\n",
-    "    set_global_service_context,\n",
+    "    Settings\n",
     ")\n",
     "from llama_index.core.base.embeddings.base import BaseEmbedding, Embedding\n",
     "from llama_index.core.prompts import PromptTemplate\n",
@@ -176,9 +255,8 @@
     "\n",
     "embedding_model = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n",
     "embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)\n",
-    "service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=llm)\n",
-    "set_global_service_context(service_context)\n",
     "Settings.embed_model = embeddings\n",
+    "Settings.llm = llm\n",
     "\n",
     "# load example documents\n",
     "documents = SimpleDirectoryReader(\"/mnt/workspace/custom_data/\").load_data()\n",
@@ -192,11 +270,7 @@
     "# do query\n",
     "response = query_engine.query(\"西安较大的校训是什么\")\n",
     "print(response)\n"
-   ],
-   "metadata": {
-    "collapsed": false
-   },
-   "id": "eef67659e94045c5"
+   ]
   }
  ],
  "metadata": {

From f77b51e3c925f8dc49df40c3a68127c9ae8f3e64 Mon Sep 17 00:00:00 2001
From: "Xingjun.Wang" <wangxingjun778@163.com>
Date: Tue, 14 Jan 2025 23:59:53 +0800
Subject: [PATCH 9/9] Support upload file and folder in the hub api (#1152)

* update features

* update api

* add upload_file and thread_executor

* update upload file

* update api.py

* add cli for uploading

* run lint

* lint in msdataset

* temp

* add tqdm_desc in thread_executor

* update

* refine upload_file and upload_folder

* add endpoint for cli

* add uploading checker

* add path_or_fileobj and path_in_repo check in upload_file func

* add size limit to lfs: 1MB by default

* update lfs limit size: 10MB

* 5MB lfs limit

* fix test issue

* add pbar for upload_blob; del size_to_chunk_mb; fix allow_patterns and ignore_patterns

* fix commit uploaded blobs

* add update action for folder

* fix issues

* add normal files check

* update

* update

* set normal file size limit to 500MB

* update tqdm
---
 docs/source/command.md              |   3 +-
 modelscope/cli/cli.py               |   2 +
 modelscope/cli/upload.py            | 179 ++++++++
 modelscope/hub/api.py               | 677 +++++++++++++++++++++++++++-
 modelscope/hub/constants.py         |   1 -
 modelscope/msdatasets/ms_dataset.py |  23 +-
 modelscope/utils/file_utils.py      |  86 +++-
 modelscope/utils/repo_utils.py      | 479 ++++++++++++++++++++
 modelscope/utils/thread_utils.py    |   9 +-
 9 files changed, 1442 insertions(+), 17 deletions(-)
 create mode 100644 modelscope/cli/upload.py
 create mode 100644 modelscope/utils/repo_utils.py

diff --git a/docs/source/command.md b/docs/source/command.md
index 2d5c73fb..5d81328a 100644
--- a/docs/source/command.md
+++ b/docs/source/command.md
@@ -24,7 +24,7 @@ options:
 Get access token: [我的页面](https://modelscope.cn/my/myaccesstoken)获取**SDK 令牌**
 
 
-## download model
+## download
 ```bash
 modelscope download --help
 
@@ -36,6 +36,7 @@ modelscope download --help
     options:
       -h, --help            show this help message and exit
       --model MODEL         The model id to be downloaded.
+      --dataset DATASET     The dataset id to be downloaded.
       --revision REVISION   Revision of the model.
       --cache_dir CACHE_DIR
                             Cache directory to save model.
diff --git a/modelscope/cli/cli.py b/modelscope/cli/cli.py
index 24fcc134..afadfa91 100644
--- a/modelscope/cli/cli.py
+++ b/modelscope/cli/cli.py
@@ -11,6 +11,7 @@ from modelscope.cli.modelcard import ModelCardCMD
 from modelscope.cli.pipeline import PipelineCMD
 from modelscope.cli.plugins import PluginsCMD
 from modelscope.cli.server import ServerCMD
+from modelscope.cli.upload import UploadCMD
 from modelscope.hub.api import HubApi
 from modelscope.utils.logger import get_logger
 
@@ -25,6 +26,7 @@ def run_cmd():
     subparsers = parser.add_subparsers(help='modelscope commands helpers')
 
     DownloadCMD.define_args(subparsers)
+    UploadCMD.define_args(subparsers)
     ClearCacheCMD.define_args(subparsers)
     PluginsCMD.define_args(subparsers)
     PipelineCMD.define_args(subparsers)
diff --git a/modelscope/cli/upload.py b/modelscope/cli/upload.py
new file mode 100644
index 00000000..4e88db80
--- /dev/null
+++ b/modelscope/cli/upload.py
@@ -0,0 +1,179 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from argparse import ArgumentParser, _SubParsersAction
+
+from modelscope.cli.base import CLICommand
+from modelscope.hub.api import HubApi, ModelScopeConfig
+from modelscope.utils.constant import REPO_TYPE_MODEL, REPO_TYPE_SUPPORT
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def subparser_func(args):
+    """ Function which will be called for a specific sub parser.
+    """
+    return UploadCMD(args)
+
+
+class UploadCMD(CLICommand):
+
+    name = 'upload'
+
+    def __init__(self, args: _SubParsersAction):
+        self.args = args
+
+    @staticmethod
+    def define_args(parsers: _SubParsersAction):
+
+        parser: ArgumentParser = parsers.add_parser(UploadCMD.name)
+
+        parser.add_argument(
+            'repo_id',
+            type=str,
+            help='The ID of the repo to upload to (e.g. `username/repo-name`)')
+        parser.add_argument(
+            'local_path',
+            type=str,
+            nargs='?',
+            default=None,
+            help='Optional, '
+            'Local path to the file or folder to upload. Defaults to current directory.'
+        )
+        parser.add_argument(
+            'path_in_repo',
+            type=str,
+            nargs='?',
+            default=None,
+            help='Optional, '
+            'Path of the file or folder in the repo. Defaults to the relative path of the file or folder.'
+        )
+        parser.add_argument(
+            '--repo-type',
+            choices=REPO_TYPE_SUPPORT,
+            default=REPO_TYPE_MODEL,
+            help=
+            'Type of the repo to upload to (e.g. `dataset`, `model`). Defaults to be `model`.',
+        )
+        parser.add_argument(
+            '--include',
+            nargs='*',
+            type=str,
+            help='Glob patterns to match files to upload.')
+        parser.add_argument(
+            '--exclude',
+            nargs='*',
+            type=str,
+            help='Glob patterns to exclude from files to upload.')
+        parser.add_argument(
+            '--commit-message',
+            type=str,
+            default=None,
+            help='The message of commit. Default to be `None`.')
+        parser.add_argument(
+            '--commit-description',
+            type=str,
+            default=None,
+            help=
+            'The description of the generated commit. Default to be `None`.')
+        parser.add_argument(
+            '--token',
+            type=str,
+            default=None,
+            help=
+            'A User Access Token generated from https://modelscope.cn/my/myaccesstoken'
+        )
+        parser.add_argument(
+            '--max-workers',
+            type=int,
+            default=min(8,
+                        os.cpu_count() + 4),
+            help='The number of workers to use for uploading files.')
+        parser.add_argument(
+            '--endpoint',
+            type=str,
+            default='https://www.modelscope.cn',
+            help='Endpoint for Modelscope service.')
+
+        parser.set_defaults(func=subparser_func)
+
+    def execute(self):
+
+        assert self.args.repo_id, '`repo_id` is required'
+        assert self.args.repo_id.count(
+            '/') == 1, 'repo_id should be in format of username/repo-name'
+        repo_name: str = self.args.repo_id.split('/')[-1]
+        self.repo_id = self.args.repo_id
+
+        # Resolve `local_path` and `path_in_repo` from the positional arguments
+        if self.args.local_path is None and os.path.isfile(repo_name):
+            # Case 1: modelscope upload owner_name/test_repo  (a file named `test_repo` exists in the cwd)
+            self.local_path = repo_name
+            self.path_in_repo = repo_name
+        elif self.args.local_path is None and os.path.isdir(repo_name):
+            # Case 2: modelscope upload owner_name/test_repo  (a folder named `test_repo` exists in the cwd)
+            # => upload all files in that folder to remote root path
+            self.local_path = repo_name
+            self.path_in_repo = '.'
+        elif self.args.local_path is None:
+            # Case 3: user provided only a repo_id that does not match a local file or folder
+            # => the user must explicitly provide a local_path => raise exception
+            raise ValueError(
+                f"'{repo_name}' is not a local file or folder. Please set `local_path` explicitly."
+            )
+        elif self.args.path_in_repo is None and os.path.isfile(
+                self.args.local_path):
+            # Case 4: modelscope upload owner_name/test_repo /path/to/your_file.csv
+            # => upload it to remote root path with same name
+            self.local_path = self.args.local_path
+            self.path_in_repo = os.path.basename(self.args.local_path)
+        elif self.args.path_in_repo is None:
+            # Case 5: modelscope upload owner_name/test_repo /path/to/your_folder
+            # => upload all files in that folder to remote root path
+            self.local_path = self.args.local_path
+            self.path_in_repo = ''
+        else:
+            # Finally, if both paths are explicit
+            self.local_path = self.args.local_path
+            self.path_in_repo = self.args.path_in_repo
+
+        # Check token and login
+        # The cookies will be reused if the user has logged in before.
+        api = HubApi(endpoint=self.args.endpoint)
+
+        if self.args.token:
+            api.login(access_token=self.args.token)
+        cookies = ModelScopeConfig.get_cookies()
+        if cookies is None:
+            raise ValueError(
+                'The `token` is not provided! '
+                'You can pass the `--token` argument, '
+                'or use api.login(access_token=`your_sdk_token`). '
+                'Your token is available at https://modelscope.cn/my/myaccesstoken'
+            )
+
+        if os.path.isfile(self.local_path):
+            commit_info = api.upload_file(
+                path_or_fileobj=self.local_path,
+                path_in_repo=self.path_in_repo,
+                repo_id=self.repo_id,
+                repo_type=self.args.repo_type,
+                commit_message=self.args.commit_message,
+                commit_description=self.args.commit_description,
+            )
+        elif os.path.isdir(self.local_path):
+            commit_info = api.upload_folder(
+                repo_id=self.repo_id,
+                folder_path=self.local_path,
+                path_in_repo=self.path_in_repo,
+                commit_message=self.args.commit_message,
+                commit_description=self.args.commit_description,
+                repo_type=self.args.repo_type,
+                allow_patterns=self.args.include,
+                ignore_patterns=self.args.exclude,
+                max_workers=self.args.max_workers,
+            )
+        else:
+            raise ValueError(f'{self.local_path} is not a valid local path')
+
+        logger.info(f'Upload finished, commit info: {commit_info}')
diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 11263299..67241b68 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -3,6 +3,7 @@
 
 import datetime
 import functools
+import io
 import os
 import pickle
 import platform
@@ -13,13 +14,15 @@ from collections import defaultdict
 from http import HTTPStatus
 from http.cookiejar import CookieJar
 from os.path import expanduser
-from typing import Dict, List, Optional, Tuple, Union
+from pathlib import Path
+from typing import Any, BinaryIO, Dict, Iterable, List, Optional, Tuple, Union
 from urllib.parse import urlencode
 
 import json
 import requests
 from requests import Session
 from requests.adapters import HTTPAdapter, Retry
+from tqdm import tqdm
 
 from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES,
                                       API_HTTP_CLIENT_TIMEOUT,
@@ -29,6 +32,7 @@ from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES,
                                       API_RESPONSE_FIELD_MESSAGE,
                                       API_RESPONSE_FIELD_USERNAME,
                                       DEFAULT_CREDENTIALS_PATH,
+                                      DEFAULT_MAX_WORKERS,
                                       MODELSCOPE_CLOUD_ENVIRONMENT,
                                       MODELSCOPE_CLOUD_USERNAME,
                                       MODELSCOPE_REQUEST_ID, ONE_YEAR_SECONDS,
@@ -36,25 +40,34 @@ from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES,
                                       TEMPORARY_FOLDER_NAME, DatasetVisibility,
                                       Licenses, ModelVisibility)
 from modelscope.hub.errors import (InvalidParameter, NotExistError,
-                                   NotLoginException, NoValidRevisionError,
-                                   RequestError, datahub_raise_on_error,
+                                   NotLoginException, RequestError,
+                                   datahub_raise_on_error,
                                    handle_http_post_error,
                                    handle_http_response, is_ok,
                                    raise_for_http_status, raise_on_error)
 from modelscope.hub.git import GitCommandWrapper
 from modelscope.hub.repository import Repository
+from modelscope.hub.utils.utils import (get_endpoint, get_readable_folder_size,
+                                        get_release_datetime,
+                                        model_id_to_group_owner_name)
 from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        DEFAULT_MODEL_REVISION,
                                        DEFAULT_REPOSITORY_REVISION,
                                        MASTER_MODEL_BRANCH, META_FILES_FORMAT,
-                                       REPO_TYPE_MODEL, ConfigFields,
+                                       REPO_TYPE_DATASET, REPO_TYPE_MODEL,
+                                       REPO_TYPE_SUPPORT, ConfigFields,
                                        DatasetFormations, DatasetMetaFormats,
                                        DatasetVisibilityMap, DownloadChannel,
                                        DownloadMode, Frameworks, ModelFile,
                                        Tasks, VirgoDatasetConfig)
+from modelscope.utils.file_utils import get_file_hash, get_file_size
 from modelscope.utils.logger import get_logger
-from .utils.utils import (get_endpoint, get_readable_folder_size,
-                          get_release_datetime, model_id_to_group_owner_name)
+from modelscope.utils.repo_utils import (DATASET_LFS_SUFFIX,
+                                         DEFAULT_IGNORE_PATTERNS,
+                                         MODEL_LFS_SUFFIX, CommitInfo,
+                                         CommitOperation, CommitOperationAdd,
+                                         RepoUtils)
+from modelscope.utils.thread_utils import thread_executor
 
 logger = get_logger()
 
@@ -93,6 +106,8 @@ class HubApi:
                     getattr(self.session, method),
                     timeout=timeout))
 
+        self.upload_checker = UploadingCheck()
+
     def login(
         self,
         access_token: str,
@@ -181,7 +196,7 @@ class HubApi:
             headers=self.builder_headers(self.headers))
         handle_http_post_error(r, path, body)
         raise_on_error(r.json())
-        model_repo_url = f'{get_endpoint()}/{model_id}'
+        model_repo_url = f'{self.endpoint}/{model_id}'
         return model_repo_url
 
     def delete_model(self, model_id: str):
@@ -1173,6 +1188,572 @@ class HubApi:
         return f'{self.endpoint}/api/v1/datasets/{_namespace}/{_dataset_name}/repo?'
         # return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath='
 
+    def create_repo(
+            self,
+            repo_id: str,
+            *,
+            token: Union[str, bool, None] = None,
+            visibility: Optional[str] = 'public',
+            repo_type: Optional[str] = REPO_TYPE_MODEL,
+            chinese_name: Optional[str] = '',
+            license: Optional[str] = Licenses.APACHE_V2,
+    ) -> str:
+
+        # TODO: exist_ok
+
+        if not repo_id:
+            raise ValueError('Repo id cannot be empty!')
+
+        if token:
+            self.login(access_token=token)
+        else:
+            logger.warning('No token provided, will use the cached token.')
+
+        repo_id_list = repo_id.split('/')
+        if len(repo_id_list) != 2:
+            raise ValueError('Invalid repo id, should be in the format of `owner_name/repo_name`')
+        namespace, repo_name = repo_id_list
+
+        if repo_type == REPO_TYPE_MODEL:
+            visibilities = {k: v for k, v in ModelVisibility.__dict__.items() if not k.startswith('__')}
+            visibility: int = visibilities.get(visibility.upper())
+            if visibility is None:
+                raise ValueError(f'Invalid visibility: {visibility}, '
+                                 f'supported visibilities: `public`, `private`, `internal`')
+            repo_url: str = self.create_model(
+                model_id=repo_id,
+                visibility=visibility,
+                license=license,
+                chinese_name=chinese_name,
+            )
+
+        elif repo_type == REPO_TYPE_DATASET:
+            visibilities = {k: v for k, v in DatasetVisibility.__dict__.items() if not k.startswith('__')}
+            visibility: int = visibilities.get(visibility.upper())
+            if visibility is None:
+                raise ValueError(f'Invalid visibility: {visibility}, '
+                                 f'supported visibilities: `public`, `private`, `internal`')
+            repo_url: str = self.create_dataset(
+                dataset_name=repo_name,
+                namespace=namespace,
+                chinese_name=chinese_name,
+                license=license,
+                visibility=visibility,
+            )
+
+        else:
+            raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')
+
+        return repo_url
+
+    def create_commit(
+            self,
+            repo_id: str,
+            operations: Iterable[CommitOperation],
+            *,
+            commit_message: str,
+            commit_description: Optional[str] = None,
+            token: str = None,
+            repo_type: Optional[str] = None,
+            revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
+    ) -> CommitInfo:
+
+        url = f'{self.endpoint}/api/v1/repos/{repo_type}s/{repo_id}/commit/{revision}'
+        commit_message = commit_message or f'Commit to {repo_id}'
+        commit_description = commit_description or ''
+
+        if token:
+            self.login(access_token=token)
+
+        # Construct payload
+        payload = self._prepare_commit_payload(
+            operations=operations,
+            commit_message=commit_message,
+        )
+
+        # POST
+        cookies = ModelScopeConfig.get_cookies()
+        if cookies is None:
+            raise ValueError('Token does not exist, please login first.')
+        response = requests.post(
+            url,
+            headers=self.builder_headers(self.headers),
+            data=json.dumps(payload),
+            cookies=cookies
+        )
+
+        resp = response.json()
+
+        if not resp['Success']:
+            commit_message = resp['Message']
+            logger.warning(f'{commit_message}')
+
+        return CommitInfo(
+            commit_url=url,
+            commit_message=commit_message,
+            commit_description=commit_description,
+            oid='',
+        )
+
+    def upload_file(
+            self,
+            *,
+            path_or_fileobj: Union[str, Path, bytes, BinaryIO],
+            path_in_repo: str,
+            repo_id: str,
+            token: Union[str, None] = None,
+            repo_type: Optional[str] = REPO_TYPE_MODEL,
+            commit_message: Optional[str] = None,
+            commit_description: Optional[str] = None,
+            buffer_size_mb: Optional[int] = 1,
+            tqdm_desc: Optional[str] = '[Uploading]',
+            disable_tqdm: Optional[bool] = False,
+    ) -> CommitInfo:
+
+        if repo_type not in REPO_TYPE_SUPPORT:
+            raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')
+
+        if not path_or_fileobj:
+            raise ValueError('Path or file object cannot be empty!')
+
+        if isinstance(path_or_fileobj, (str, Path)):
+            path_or_fileobj = os.path.abspath(os.path.expanduser(path_or_fileobj))
+            path_in_repo = path_in_repo or os.path.basename(path_or_fileobj)
+
+        else:
+            # If path_or_fileobj is bytes or BinaryIO, then path_in_repo must be provided
+            if not path_in_repo:
+                raise ValueError('Arg `path_in_repo` cannot be empty!')
+
+        # Read file content if path_or_fileobj is a file-like object (BinaryIO)
+        # TODO: to be refined
+        if isinstance(path_or_fileobj, io.BufferedIOBase):
+            path_or_fileobj = path_or_fileobj.read()
+
+        self.upload_checker.check_file(path_or_fileobj)
+        self.upload_checker.check_normal_files(
+            file_path_list=[path_or_fileobj],
+            repo_type=repo_type,
+        )
+
+        if token:
+            self.login(access_token=token)
+
+        commit_message = (
+            commit_message if commit_message is not None else f'Upload {path_in_repo} to ModelScope hub'
+        )
+
+        if buffer_size_mb <= 0:
+            raise ValueError('Buffer size: `buffer_size_mb` must be greater than 0')
+
+        hash_info_d: dict = get_file_hash(
+            file_path_or_obj=path_or_fileobj,
+            buffer_size_mb=buffer_size_mb,
+        )
+        file_size: int = hash_info_d['file_size']
+        file_hash: str = hash_info_d['file_hash']
+
+        upload_res: dict = self._upload_blob(
+            repo_id=repo_id,
+            repo_type=repo_type,
+            sha256=file_hash,
+            size=file_size,
+            data=path_or_fileobj,
+            disable_tqdm=disable_tqdm,
+            tqdm_desc=tqdm_desc,
+        )
+
+        # Construct commit info and create commit
+        add_operation: CommitOperationAdd = CommitOperationAdd(
+            path_in_repo=path_in_repo,
+            path_or_fileobj=path_or_fileobj,
+        )
+        add_operation._upload_mode = 'lfs' if self.upload_checker.is_lfs(path_or_fileobj, repo_type) else 'normal'
+        add_operation._is_uploaded = upload_res['is_uploaded']
+        operations = [add_operation]
+
+        commit_info: CommitInfo = self.create_commit(
+            repo_id=repo_id,
+            operations=operations,
+            commit_message=commit_message,
+            commit_description=commit_description,
+            token=token,
+            repo_type=repo_type,
+        )
+
+        return commit_info
+
+    def upload_folder(
+            self,
+            *,
+            repo_id: str,
+            folder_path: Union[str, Path],
+            path_in_repo: Optional[str] = '',
+            commit_message: Optional[str] = None,
+            commit_description: Optional[str] = None,
+            token: Union[str, None] = None,
+            repo_type: Optional[str] = REPO_TYPE_MODEL,
+            allow_patterns: Optional[Union[List[str], str]] = None,
+            ignore_patterns: Optional[Union[List[str], str]] = None,
+            max_workers: int = DEFAULT_MAX_WORKERS,
+    ) -> CommitInfo:
+
+        if repo_type not in REPO_TYPE_SUPPORT:
+            raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')
+
+        allow_patterns = allow_patterns if allow_patterns else None
+        ignore_patterns = ignore_patterns if ignore_patterns else None
+
+        self.upload_checker.check_folder(folder_path)
+
+        # Ignore .git folder
+        if ignore_patterns is None:
+            ignore_patterns = []
+        elif isinstance(ignore_patterns, str):
+            ignore_patterns = [ignore_patterns]
+        ignore_patterns += DEFAULT_IGNORE_PATTERNS
+
+        if token:
+            self.login(access_token=token)
+
+        commit_message = (
+            commit_message if commit_message is not None else f'Upload folder to {repo_id} on ModelScope hub'
+        )
+        commit_description = commit_description or 'Uploading folder'
+
+        # Get the list of files to upload, e.g. [('data/abc.png', '/path/to/abc.png'), ...]
+        prepared_repo_objects = HubApi._prepare_upload_folder(
+            folder_path=folder_path,
+            path_in_repo=path_in_repo,
+            allow_patterns=allow_patterns,
+            ignore_patterns=ignore_patterns,
+        )
+
+        self.upload_checker.check_normal_files(
+            file_path_list = [item for _, item in prepared_repo_objects],
+            repo_type=repo_type,
+        )
+
+        @thread_executor(max_workers=max_workers, disable_tqdm=False)
+        def _upload_items(item_pair, **kwargs):
+            file_path_in_repo, file_path = item_pair
+
+            hash_info_d: dict = get_file_hash(
+                file_path_or_obj=file_path,
+            )
+            file_size: int = hash_info_d['file_size']
+            file_hash: str = hash_info_d['file_hash']
+
+            upload_res: dict = self._upload_blob(
+                repo_id=repo_id,
+                repo_type=repo_type,
+                sha256=file_hash,
+                size=file_size,
+                data=file_path,
+                disable_tqdm=False if file_size > 10 * 1024 * 1024 else True,
+            )
+
+            return {
+                'file_path_in_repo': file_path_in_repo,
+                'file_path': file_path,
+                'is_uploaded': upload_res['is_uploaded'],
+            }
+
+        uploaded_items_list = _upload_items(
+            prepared_repo_objects,
+            repo_id=repo_id,
+            token=token,
+            repo_type=repo_type,
+            commit_message=commit_message,
+            commit_description=commit_description,
+            buffer_size_mb=1,
+            tqdm_desc='[Uploading]',
+            disable_tqdm=False,
+        )
+
+        logger.info(f'Uploading folder to {repo_id} finished')
+
+        # Construct commit info and create commit
+        operations = []
+
+        for item_d in uploaded_items_list:
+            prepared_path_in_repo: str = item_d['file_path_in_repo']
+            prepared_file_path: str = item_d['file_path']
+            is_uploaded: bool = item_d['is_uploaded']
+            opt = CommitOperationAdd(
+                path_in_repo=prepared_path_in_repo,
+                path_or_fileobj=prepared_file_path,
+            )
+
+            # check normal or lfs
+            opt._upload_mode = 'lfs' if self.upload_checker.is_lfs(prepared_file_path, repo_type) else 'normal'
+            opt._is_uploaded = is_uploaded
+            operations.append(opt)
+
+        self.create_commit(
+            repo_id=repo_id,
+            operations=operations,
+            commit_message=commit_message,
+            commit_description=commit_description,
+            token=token,
+            repo_type=repo_type,
+        )
+
+        # Construct commit info
+        commit_url = f'{self.endpoint}/api/v1/{repo_type}s/{repo_id}/commit/{DEFAULT_REPOSITORY_REVISION}'
+        return CommitInfo(
+            commit_url=commit_url,
+            commit_message=commit_message,
+            commit_description=commit_description,
+            oid='')
+
+    def _upload_blob(
+            self,
+            *,
+            repo_id: str,
+            repo_type: str,
+            sha256: str,
+            size: int,
+            data: Union[str, Path, bytes, BinaryIO],
+            disable_tqdm: Optional[bool] = False,
+            tqdm_desc: Optional[str] = '[Uploading]',
+            buffer_size_mb: Optional[int] = 1,
+    ) -> dict:
+        """
+        Upload a single blob to the hub unless the hub reports nothing to upload.
+
+        Args:
+            repo_id (str): The repo id on ModelScope.
+            repo_type (str): The repo type, forwarded to `_validate_blob`.
+            sha256 (str): The sha256 of the blob, sent as its `oid`.
+            size (int): The blob size in bytes; also the progress bar total.
+            data: A local file path, raw bytes or an `io.BufferedIOBase` stream.
+            disable_tqdm (bool): Whether to hide the progress bar.
+            tqdm_desc (str): The progress bar description.
+            buffer_size_mb (int): The size in MB of each chunk that is sent.
+
+        Returns:
+            dict: Keys `url`, `is_uploaded`, `status_code` and `status_msg`;
+                `is_uploaded` is True only when the upload was skipped.
+
+        Raises:
+            ValueError: If no cookies are stored or `data` has an unsupported type.
+        """
+        result: dict = dict(
+            url=None,
+            is_uploaded=False,
+            status_code=None,
+            status_msg=None,
+        )
+
+        pending = self._validate_blob(
+            repo_id=repo_id,
+            repo_type=repo_type,
+            objects=[{'oid': sha256, 'size': size}],
+        )
+
+        # Anything other than exactly one pending object means there is nothing to send.
+        if len(pending) != 1:
+            logger.info(f'Blob {sha256} has already uploaded, reuse it.')
+            result['is_uploaded'] = True
+            return result
+        target = pending[0]  # {'url': 'xxx', 'oid': 'xxx'}
+
+        cookies = ModelScopeConfig.get_cookies()
+        cookies = dict(cookies) if cookies else None
+        if cookies is None:
+            raise ValueError('Token does not exist, please login first.')
+
+        # The session cookie is written into the instance-level `self.headers`.
+        self.headers.update({'Cookie': f"m_session_id={cookies['m_session_id']}"})
+        headers = self.builder_headers(self.headers)
+        chunk_size = buffer_size_mb * 1024 * 1024
+
+        with tqdm(total=size, unit='B', unit_scale=True, desc=tqdm_desc, disable=disable_tqdm) as pbar:
+            def _chunks(stream):
+                """Yield `stream` piece by piece, advancing the progress bar."""
+                while True:
+                    piece = stream.read(chunk_size)
+                    if not piece:
+                        return
+                    pbar.update(len(piece))
+                    yield piece
+
+            def _put(stream):
+                """PUT the content of `stream` to the upload url."""
+                return requests.put(target['url'], headers=headers, data=_chunks(stream))
+
+            if isinstance(data, (str, Path)):
+                with open(data, 'rb') as stream:
+                    response = _put(stream)
+            elif isinstance(data, bytes):
+                response = _put(io.BytesIO(data))
+            elif isinstance(data, io.BufferedIOBase):
+                response = _put(data)
+            else:
+                raise ValueError('Invalid data type to upload')
+
+        resp = response.json()
+        raise_on_error(resp)
+
+        result['url'] = target['url']
+        result['status_code'] = resp['Code']
+        result['status_msg'] = resp['Message']
+        return result
+
+    def _validate_blob(
+            self,
+            *,
+            repo_id: str,
+            repo_type: str,
+            objects: List[Dict[str, Any]],
+    ) -> List[Dict[str, Any]]:
+        """
+        Ask the hub's LFS batch endpoint which of the given blobs still have to be uploaded.
+
+        Args:
+            repo_id (str): The repo id ModelScope.
+            repo_type (str): The repo type. `dataset`, `model`, etc.
+            objects (List[Dict[str, Any]]): The objects to check.
+                oid (str): The sha256 hash value.
+                size (int): The size of the blob.
+
+        Returns:
+            List[Dict[str, Any]]: `{'url': ..., 'oid': ...}` for each blob that must be
+                uploaded. Blobs whose response entry has no upload action (which the
+                Git LFS batch API uses for objects the server already has) are omitted.
+
+        Raises:
+            ValueError: If no cookies are stored locally.
+        """
+
+        # construct URL
+        url = f'{self.endpoint}/api/v1/repos/{repo_type}s/{repo_id}/info/lfs/objects/batch'
+
+        # build payload
+        payload = {'operation': 'upload', 'objects': objects}
+
+        cookies = ModelScopeConfig.get_cookies()
+        if cookies is None:
+            raise ValueError('Token does not exist, please login first.')
+        response = requests.post(
+            url,
+            headers=self.builder_headers(self.headers),
+            data=json.dumps(payload),
+            cookies=cookies
+        )
+
+        resp = response.json()
+        raise_on_error(resp)
+
+        upload_objects = []   # list of objects to upload, [{'url': 'xxx', 'oid': 'xxx'}, ...]
+        for obj in resp['Data']['objects']:
+            # Tolerate entries without `actions`/`upload` instead of raising KeyError.
+            upload_action = (obj.get('actions') or {}).get('upload')
+            if upload_action:
+                upload_objects.append({'url': upload_action['href'], 'oid': obj['oid']})
+
+        return upload_objects
+
+    @staticmethod
+    def _prepare_upload_folder(
+        folder_path: Union[str, Path],
+        path_in_repo: str,
+        allow_patterns: Optional[Union[List[str], str]] = None,
+        ignore_patterns: Optional[Union[List[str], str]] = None,
+    ) -> List[Union[tuple, list]]:
+        """
+        Collect the files under `folder_path` as `(path_in_repo, local_path)` pairs.
+
+        `path_in_repo` is the target directory in the repo; an empty value or one made
+        only of slashes means the repo root. `allow_patterns` / `ignore_patterns` are
+        applied to the posix-style paths relative to `folder_path`.
+
+        Raises:
+            ValueError: If `folder_path` is not a directory.
+        """
+        folder_path = Path(folder_path).expanduser().resolve()
+        if not folder_path.is_dir():
+            raise ValueError(f"Provided path: '{folder_path}' is not a directory")
+
+        # List files from folder; sorted to be deterministic
+        relpath_to_abspath = {
+            path.relative_to(folder_path).as_posix(): path
+            for path in sorted(folder_path.glob('**/*')) if path.is_file()
+        }
+        filtered_repo_objects = RepoUtils.filter_repo_objects(
+            relpath_to_abspath.keys(), allow_patterns=allow_patterns, ignore_patterns=ignore_patterns)
+
+        # Strip before testing so that '/' maps to the repo root instead of a '/' prefix.
+        stripped = path_in_repo.strip('/') if path_in_repo else ''
+        prefix = f'{stripped}/' if stripped else ''
+
+        return [(prefix + relpath, str(relpath_to_abspath[relpath])) for relpath in filtered_repo_objects]
+
+    @staticmethod
+    def _prepare_commit_payload(
+            operations: Iterable[CommitOperation],
+            commit_message: str,
+    ) -> Dict[str, Any]:
+        """
+        Build the commit payload that is sent to the ModelScope hub.
+
+        Every action carries the keys `action`, `path`, `type`, `size`, `sha256`,
+        `content` and `encoding`.
+
+        Args:
+            operations (Iterable[CommitOperation]): The operations to commit.
+            commit_message (str): Stored under the `commit_message` key.
+
+        Returns:
+            Dict[str, Any]: `{'commit_message': ..., 'actions': [...]}` holding one
+                action per operation that is not flagged `_should_ignore`.
+
+        Raises:
+            ValueError: If an operation is not a `CommitOperationAdd` or its
+                `_upload_mode` is neither 'normal' nor 'lfs'.
+        """
+        actions = []
+        skipped = 0
+
+        for op in operations:
+            is_add = isinstance(op, CommitOperationAdd)
+
+            # Skip ignored files
+            if is_add and op._should_ignore:
+                logger.debug(f"Skipping file '{op.path_in_repo}' in commit (ignored by gitignore file).")
+                skipped += 1
+                continue
+
+            # Only additions in 'normal' or 'lfs' mode can be committed.
+            mode = op._upload_mode if is_add else None
+            if mode not in ('normal', 'lfs'):
+                raise ValueError(
+                    f'Unknown operation to commit. Operation: {op}. Upload mode:'
+                    f" {getattr(op, '_upload_mode', None)}"
+                )
+
+            # A normal file travels base64-encoded inside the payload, whereas an
+            # LFS file is referenced by its sha256 only.
+            inline = mode == 'normal'
+            actions.append({
+                'action': 'update' if op._is_uploaded else 'create',
+                'path': op.path_in_repo,
+                'type': 'normal' if inline else 'lfs',
+                'size': op.upload_info.size,
+                'sha256': '' if inline else op.upload_info.sha256,
+                'content': op.b64content().decode() if inline else '',
+                'encoding': 'base64' if inline else '',
+            })
+
+        if skipped > 0:
+            logger.info(f'Skipped {skipped} file(s) in commit (ignored by gitignore file).')
+
+        return {
+            'commit_message': commit_message,
+            'actions': actions,
+        }
+
 
 class ModelScopeConfig:
     path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
@@ -1316,3 +1897,85 @@ class ModelScopeConfig:
         elif isinstance(user_agent, str):
             ua += '; ' + user_agent
         return ua
+
+
+class UploadingCheck:
+    """Client-side limits that are checked before files are uploaded to the hub."""
+
+    def __init__(
+            self,
+            max_file_count: int = 100_000,
+            max_file_count_in_dir: int = 10_000,
+            max_file_size: int = 50 * 1024 ** 3,
+            lfs_size_limit: int = 5 * 1024 * 1024,
+            normal_file_size_total_limit: int = 500 * 1024 * 1024,
+    ):
+        self.max_file_count = max_file_count
+        self.max_file_count_in_dir = max_file_count_in_dir
+        self.max_file_size = max_file_size
+        self.lfs_size_limit = lfs_size_limit
+        self.normal_file_size_total_limit = normal_file_size_total_limit
+
+    def check_file(self, file_path_or_obj):
+        """Raise ValueError for a missing path or for content larger than `max_file_size`."""
+        is_path = isinstance(file_path_or_obj, (str, Path))
+        if is_path and not os.path.exists(file_path_or_obj):
+            raise ValueError(f'File {file_path_or_obj} does not exist')
+
+        if get_file_size(file_path_or_obj) > self.max_file_size:
+            raise ValueError(f'File exceeds size limit: {self.max_file_size / (1024 ** 3)} GB')
+
+    def check_folder(self, folder_path: Union[str, Path]):
+        """
+        Recursively count the entries below `folder_path`, returning `(file_count, dir_count)`.
+        Raises ValueError when a sub-directory holds more than `max_file_count_in_dir` entries
+        (counted recursively) or a folder holds more than `max_file_count` files.
+        """
+        root = Path(folder_path) if isinstance(folder_path, str) else folder_path
+        n_files, n_dirs = 0, 0
+        for entry in root.iterdir():
+            if entry.is_file():
+                n_files += 1
+                continue
+            if not entry.is_dir():
+                continue
+
+            sub_files, sub_dirs = self.check_folder(entry)
+            sub_total = sub_files + sub_dirs
+            if sub_total > self.max_file_count_in_dir:
+                raise ValueError(f'Directory {entry} contains {sub_total} items '
+                                 f'and exceeds limit: {self.max_file_count_in_dir}')
+            n_files += sub_files
+            n_dirs += 1 + sub_dirs
+
+        if n_files > self.max_file_count:
+            raise ValueError(f'Total file count {n_files} and exceeds limit: {self.max_file_count}')
+        return n_files, n_dirs
+
+    def is_lfs(self, file_path_or_obj: Union[str, Path, bytes, BinaryIO], repo_type: str) -> bool:
+        """
+        Tell whether the content goes through LFS: it is larger than `lfs_size_limit`,
+        or it is a path whose suffix is an LFS suffix of `repo_type`, or it is not a path.
+        """
+        suffix_hit = True
+        if isinstance(file_path_or_obj, (str, Path)):
+            file_path_or_obj = Path(file_path_or_obj)
+            if not file_path_or_obj.exists():
+                raise ValueError(f'File {file_path_or_obj} does not exist')
+
+            if repo_type == REPO_TYPE_MODEL:
+                suffix_hit = file_path_or_obj.suffix in MODEL_LFS_SUFFIX
+            elif repo_type == REPO_TYPE_DATASET:
+                suffix_hit = file_path_or_obj.suffix in DATASET_LFS_SUFFIX
+            else:
+                raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')
+
+        return get_file_size(file_path_or_obj) > self.lfs_size_limit or suffix_hit
+
+    def check_normal_files(self, file_path_list: List[Union[str, Path]], repo_type: str) -> None:
+        """Raise ValueError if the non-LFS files together exceed `normal_file_size_total_limit`."""
+        normal_files = [path for path in file_path_list if not self.is_lfs(path, repo_type)]
+        total_size = sum(map(get_file_size, normal_files))
+        if total_size > self.normal_file_size_total_limit:
+            raise ValueError(f'Total size of non-lfs files {total_size/(1024 * 1024)}MB '
+                             f'and exceeds limit: {self.normal_file_size_total_limit/(1024 * 1024)}MB')
diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py
index 1b3b3593..e14d5838 100644
--- a/modelscope/hub/constants.py
+++ b/modelscope/hub/constants.py
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
 import os
 from pathlib import Path
 
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index dbe15171..21599a1b 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -6,7 +6,8 @@ from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
                     Sequence, Union)
 
 import numpy as np
-from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+from datasets import (Dataset, DatasetDict, Features, IterableDataset,
+                      IterableDatasetDict)
 from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.utils.file_utils import is_relative_path
 
@@ -163,6 +164,7 @@ class MsDataset:
         download_mode: Optional[DownloadMode] = DownloadMode.
         REUSE_DATASET_IF_EXISTS,
         cache_dir: Optional[str] = MS_DATASETS_CACHE,
+        features: Optional[Features] = None,
         use_streaming: Optional[bool] = False,
         stream_batch_size: Optional[int] = 1,
         custom_cfg: Optional[Config] = Config(),
@@ -305,7 +307,7 @@ class MsDataset:
                         data_files=data_files,
                         split=split,
                         cache_dir=cache_dir,
-                        features=None,
+                        features=features,
                         download_config=None,
                         download_mode=download_mode.value,
                         revision=version,
@@ -334,6 +336,9 @@ class MsDataset:
                 return dataset_inst
 
         elif hub == Hubs.virgo:
+            warnings.warn(
+                'The option `Hubs.virgo` is deprecated, '
+                'will be removed in the future version.', DeprecationWarning)
             from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader
             from modelscope.utils.constant import VirgoDatasetConfig
             # Rewrite the namespace, version and cache_dir for virgo dataset.
@@ -395,8 +400,10 @@ class MsDataset:
 
         """
         warnings.warn(
-            'upload is deprecated, please use git command line to upload the dataset.',
-            DeprecationWarning)
+            'The function `upload` is deprecated, '
+            'please use git command '
+            'or modelscope.hub.api.HubApi.upload_folder '
+            'or modelscope.hub.api.HubApi.upload_file.', DeprecationWarning)
 
         if not object_name:
             raise ValueError('object_name cannot be empty!')
@@ -446,7 +453,7 @@ class MsDataset:
         """
 
         warnings.warn(
-            'upload is deprecated, please use git command line to upload the dataset.',
+            'The function `clone_meta` is deprecated, please use git command line to clone the repo.',
             DeprecationWarning)
 
         _repo = DatasetRepository(
@@ -487,6 +494,12 @@ class MsDataset:
             None
 
         """
+        warnings.warn(
+            'The function `upload_meta` is deprecated, '
+            'please use git command '
+            'or CLI `modelscope upload owner_name/repo_name ...`.',
+            DeprecationWarning)
+
         _repo = DatasetRepository(
             repo_work_dir=dataset_work_dir,
             dataset_id='',
diff --git a/modelscope/utils/file_utils.py b/modelscope/utils/file_utils.py
index 50f66e34..4a86d4bf 100644
--- a/modelscope/utils/file_utils.py
+++ b/modelscope/utils/file_utils.py
@@ -1,9 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+import hashlib
 import inspect
+import io
 import os
 from pathlib import Path
 from shutil import Error, copy2, copystat
+from typing import BinaryIO, Optional, Union
 
 
 # TODO: remove this api, unify to flattened args
@@ -180,3 +182,85 @@ def copytree_py37(src,
     if errors:
         raise Error(errors)
     return dst
+
+
+def get_file_size(file_path_or_obj: Union[str, Path, bytes, BinaryIO]) -> int:
+    """Return the size in bytes of a path, bytes or binary stream (stream position is kept)."""
+    if isinstance(file_path_or_obj, bytes):
+        return len(file_path_or_obj)
+    if isinstance(file_path_or_obj, (str, Path)):
+        return Path(file_path_or_obj).stat().st_size
+    if not isinstance(file_path_or_obj, io.BufferedIOBase):
+        raise TypeError(
+            'Unsupported type: must be string, Path, bytes, or io.BufferedIOBase'
+        )
+    # Measure a stream by seeking to its end, then put the cursor back.
+    saved_position = file_path_or_obj.tell()
+    file_path_or_obj.seek(0, os.SEEK_END)
+    end_position = file_path_or_obj.tell()
+    file_path_or_obj.seek(saved_position)
+    return end_position
+
+
+def get_file_hash(
+    file_path_or_obj: Union[str, Path, bytes, BinaryIO],
+    buffer_size_mb: Optional[int] = 1,
+    tqdm_desc: Optional[str] = '[Calculating]',
+    disable_tqdm: Optional[bool] = True,
+) -> dict:
+    """Compute the sha256 of a file path, a bytes object or a binary stream.
+
+    A stream is hashed from its current position to the end and is then rewound
+    to that position, so the caller can still read (e.g. upload) the same content.
+
+    Returns:
+        dict: `file_path_or_obj`, `file_hash` (hex digest), `file_size`,
+            `chunk_size`, `chunk_nums` and `chunk_hash_list` (hex digest per chunk).
+
+    Raises:
+        TypeError: From `get_file_size`, if the input type is not supported.
+    """
+    from tqdm import tqdm
+
+    file_size = get_file_size(file_path_or_obj)
+    buffer_size = buffer_size_mb * 1024 * 1024
+    file_hash = hashlib.sha256()
+    chunk_hash_list = []
+
+    def _consume(stream, progress):
+        """Feed `stream` chunk by chunk into the file hash and the chunk hash list."""
+        while byte_chunk := stream.read(buffer_size):
+            chunk_hash_list.append(hashlib.sha256(byte_chunk).hexdigest())
+            file_hash.update(byte_chunk)
+            progress.update(len(byte_chunk))
+
+    # The context manager closes the progress bar even when reading fails.
+    with tqdm(total=file_size, initial=0, unit_scale=True, dynamic_ncols=True,
+              unit='B', desc=tqdm_desc, disable=disable_tqdm) as progress:
+        if isinstance(file_path_or_obj, (str, Path)):
+            with open(file_path_or_obj, 'rb') as f:
+                _consume(f, progress)
+            final_chunk_size = buffer_size
+        elif isinstance(file_path_or_obj, bytes):
+            file_hash.update(file_path_or_obj)
+            chunk_hash_list.append(file_hash.hexdigest())
+            final_chunk_size = len(file_path_or_obj)
+            progress.update(final_chunk_size)
+        elif isinstance(file_path_or_obj, io.BufferedIOBase):
+            start_position = file_path_or_obj.tell()
+            try:
+                _consume(file_path_or_obj, progress)
+            finally:
+                file_path_or_obj.seek(start_position)
+            final_chunk_size = buffer_size
+        else:
+            raise ValueError('Input must be str, Path, bytes or a io.BufferedIOBase')
+
+    return {
+        'file_path_or_obj': file_path_or_obj,
+        'file_hash': file_hash.hexdigest(),
+        'file_size': file_size,
+        'chunk_size': final_chunk_size,
+        'chunk_nums': len(chunk_hash_list),
+        'chunk_hash_list': chunk_hash_list,
+    }
diff --git a/modelscope/utils/repo_utils.py b/modelscope/utils/repo_utils.py
new file mode 100644
index 00000000..40fe4f1e
--- /dev/null
+++ b/modelscope/utils/repo_utils.py
@@ -0,0 +1,479 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2022-present, the HuggingFace Inc. team.
+import base64
+import functools
+import hashlib
+import io
+import os
+import sys
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from fnmatch import fnmatch
+from pathlib import Path
+from typing import (BinaryIO, Callable, Generator, Iterable, Iterator, List,
+                    Literal, Optional, TypeVar, Union)
+
+from modelscope.utils.file_utils import get_file_hash
+
+T = TypeVar('T')  # generic item type used by `RepoUtils.filter_repo_objects`
+# Always ignore `.git` and `.cache/modelscope` folders in commits
+DEFAULT_IGNORE_PATTERNS = [
+    '.git',
+    '.git/*',
+    '*/.git',
+    '**/.git/**',
+    '.cache/modelscope',
+    '.cache/modelscope/*',
+    '*/.cache/modelscope',
+    '**/.cache/modelscope/**',
+]
+# Forbidden to commit these folders
+FORBIDDEN_FOLDERS = ['.git', '.cache']
+
+UploadMode = Literal['lfs', 'normal']  # type of `CommitOperationAdd._upload_mode`
+
+DATASET_LFS_SUFFIX = [  # suffixes (with the leading dot) that `UploadingCheck.is_lfs` checks for dataset repos
+    '.7z',
+    '.aac',
+    '.arrow',
+    '.audio',
+    '.bmp',
+    '.bin',
+    '.bz2',
+    '.flac',
+    '.ftz',
+    '.gif',
+    '.gz',
+    '.h5',
+    '.jack',
+    '.jpeg',
+    '.jpg',
+    '.jsonl',
+    '.joblib',
+    '.lz4',
+    '.msgpack',
+    '.npy',
+    '.npz',
+    '.ot',
+    '.parquet',
+    '.pb',
+    '.pickle',
+    '.pcm',
+    '.pkl',
+    '.raw',
+    '.rar',
+    '.sam',
+    '.tar',
+    '.tgz',
+    '.wasm',
+    '.wav',
+    '.webm',
+    '.webp',
+    '.zip',
+    '.zst',
+    '.tiff',
+    '.mp3',
+    '.mp4',
+    '.ogg',
+]
+
+MODEL_LFS_SUFFIX = [  # suffixes (with the leading dot) that `UploadingCheck.is_lfs` checks for model repos
+    '.7z',
+    '.arrow',
+    '.bin',
+    '.bz2',
+    '.ckpt',
+    '.ftz',
+    '.gz',
+    '.h5',
+    '.joblib',
+    '.mlmodel',
+    '.model',
+    '.msgpack',
+    '.npy',
+    '.npz',
+    '.onnx',
+    '.ot',
+    '.parquet',
+    '.pb',
+    '.pickle',
+    '.pkl',
+    '.pt',
+    '.pth',
+    '.rar',
+    '.safetensors',
+    '.tar',
+    '.tflite',
+    '.tgz',
+    '.wasm',
+    '.xz',
+    '.zip',
+    '.zst',
+]
+
+
+class RepoUtils:
+    """Helpers for selecting which files of a repo are processed."""
+
+    @staticmethod
+    def filter_repo_objects(
+        items: Iterable[T],
+        *,
+        allow_patterns: Optional[Union[List[str], str]] = None,
+        ignore_patterns: Optional[Union[List[str], str]] = None,
+        key: Optional[Callable[[T], str]] = None,
+    ) -> Generator[T, None, None]:
+        """Filter repo objects based on an allowlist and a denylist.
+
+        Input must be a list of paths (`str` or `Path`) or a list of arbitrary objects.
+        In the latter case, `key` must be provided and specifies a function of one argument
+        that is used to extract a path from each element in iterable.
+
+        Patterns are Unix shell-style wildcards which are NOT regular expressions. See
+        https://docs.python.org/3/library/fnmatch.html for more details. Matching is
+        done with `fnmatch.fnmatch`, so whether it is case-sensitive depends on the
+        operating system. A pattern ending with `/` is matched as `<pattern>*`.
+
+        Empty `allow_patterns` / `ignore_patterns` (None, '' or []) disable the respective filter.
+
+        Args:
+            items (`Iterable`):
+                List of items to filter.
+            allow_patterns (`str` or `List[str]`, *optional*):
+                Patterns constituting the allowlist. If provided, item paths must match at
+                least one pattern from the allowlist.
+            ignore_patterns (`str` or `List[str]`, *optional*):
+                Patterns constituting the denylist. If provided, item paths must not match
+                any patterns from the denylist.
+            key (`Callable[[T], str]`, *optional*):
+                Single-argument function to extract a path from each item. If not provided,
+                the `items` must already be `str` or `Path`.
+
+        Returns:
+            Filtered list of objects, as a generator.
+
+        Raises:
+            :class:`ValueError`:
+                If `key` is not provided and items are not `str` or `Path`.
+
+        Example usage with paths:
+        ```python
+        >>> # Filter only PDFs that are not hidden.
+        >>> list(RepoUtils.filter_repo_objects(
+        ...     ["aaa.pdf", "bbb.jpg", ".ccc.pdf", ".ddd.png"],
+        ...     allow_patterns=["*.pdf"],
+        ...     ignore_patterns=[".*"],
+        ... ))
+        ["aaa.pdf"]
+        ```
+        """
+
+        def _normalize(patterns):
+            """Map empty input to None and anything else to a list of expanded patterns."""
+            if not patterns:
+                return None
+            if isinstance(patterns, str):
+                patterns = [patterns]
+            return [RepoUtils._add_wildcard_to_directories(p) for p in patterns]
+
+        # Both pattern lists are normalized the same way.
+        allow_patterns = _normalize(allow_patterns)
+        ignore_patterns = _normalize(ignore_patterns)
+
+        if key is None:
+
+            def _identity(item: T) -> str:
+                if isinstance(item, str):
+                    return item
+                if isinstance(item, Path):
+                    return str(item)
+                raise ValueError(
+                    f'Please provide `key` argument in `filter_repo_objects`: `{item}` is not a string.'
+                )
+
+            key = _identity  # Items must be `str` or `Path`, otherwise raise ValueError
+
+        for item in items:
+            path = key(item)
+
+            # Skip if there's an allowlist and path doesn't match any
+            if allow_patterns is not None and not any(
+                    fnmatch(path, r) for r in allow_patterns):
+                continue
+
+            # Skip if there's a denylist and path matches any
+            if ignore_patterns is not None and any(
+                    fnmatch(path, r) for r in ignore_patterns):
+                continue
+
+            yield item
+
+    @staticmethod
+    def _add_wildcard_to_directories(pattern: str) -> str:
+        """Append `*` to a pattern that ends with `/` so that it matches the folder content.
+
+        Any other pattern, including the empty string, is returned unchanged.
+        """
+        # `endswith` copes with '' where indexing `pattern[-1]` would raise IndexError.
+        return pattern + '*' if pattern.endswith('/') else pattern
+
+
+@dataclass
+class CommitInfo(str):
+    """Data structure containing information about a newly created commit.
+
+    Returned by any method that creates a commit on the Hub: [`create_commit`], [`upload_file`], [`upload_folder`],
+    [`delete_file`], [`delete_folder`]. It inherits from `str` for backward compatibility but using methods specific
+    to `str` is deprecated.
+
+    Attributes:
+        commit_url (`str`):
+            Url where to find the commit.
+
+        commit_message (`str`):
+            The summary (first line) of the commit that has been created.
+
+        commit_description (`str`):
+            Description of the commit that has been created. Can be empty.
+
+        oid (`str`):
+            Commit hash id. Example: `"91c54ad1727ee830252e457677f467be0bfd8a57"`.
+
+        pr_url (`str`, *optional*):
+            Url to the PR that has been created, if any. Populated when `create_pr=True`
+            is passed.
+
+        pr_revision (`str`, *optional*):
+            Revision of the PR that has been created, if any. Populated when
+            `create_pr=True` is passed. Example: `"refs/pr/1"`.
+
+        pr_num (`int`, *optional*):
+            Number of the PR discussion that has been created, if any. Populated when
+            `create_pr=True` is passed. Can be passed as `discussion_num` in
+            [`get_discussion_details`]. Example: `1`.
+
+        _url (`str`, *optional*):
+            Legacy url for `str` compatibility. Can be the url to the uploaded file on the Hub (if returned by
+            [`upload_file`]), to the uploaded folder on the Hub (if returned by [`upload_folder`]) or to the commit on
+            the Hub (if returned by [`create_commit`]). Defaults to `commit_url`. It is deprecated to use this
+            attribute. Please use `commit_url` instead.
+    """
+
+    commit_url: str
+    commit_message: str
+    commit_description: str
+    oid: str
+    pr_url: Optional[str] = None
+
+    # Excluded from `__init__`. This class defines no `__post_init__`, so both keep
+    # the None default (without a default, `repr()` and `==` would raise AttributeError).
+    pr_revision: Optional[str] = field(init=False, default=None)
+    pr_num: Optional[int] = field(init=False, default=None)
+
+    # legacy url for `str` compatibility (ex: url to uploaded file, url to uploaded folder, url to PR, etc.)
+    _url: str = field(
+        repr=False, default=None)  # type: ignore  # defaults to `commit_url`
+
+    def __new__(cls, *args, commit_url: str, _url: Optional[str] = None, **kwargs):
+        # The `str` value of the instance is the legacy url if given, else the commit url;
+        # `commit_url` is keyword-only here, so it has to be passed by name.
+        return str.__new__(cls, _url or commit_url)
+
+    def to_dict(self):
+        """Return the fields that are passed at construction time as a plain dict."""
+        return {
+            'commit_url': self.commit_url,
+            'commit_message': self.commit_message,
+            'commit_description': self.commit_description,
+            'oid': self.oid,
+            'pr_url': self.pr_url,
+        }
+
+
+def git_hash(data: bytes) -> str:
+    """
+    Return the git blob id (sha1) of `data`, i.e. what `git hash-object` computes for it.
+
+    Git hashes a `blob <size>` header, a NUL byte and then the content itself, see
+    https://git-scm.com/docs/git-hash-object for more details.
+
+    Note: this is valid for regular files. For LFS files the proper git hash would be
+          computed on the pointer file content rather than on the actual file content;
+          for simplicity the sha256 of the content is compared for LFS files instead.
+
+    Args:
+        data (`bytes`):
+            The content to hash.
+
+    Returns:
+        `str`: The hexadecimal git-hash of `data`.
+    """
+    # `usedforsecurity` is only passed on Python 3.9+, as in the version check below.
+    extra = {'usedforsecurity': False} if sys.version_info >= (3, 9) else {}
+    digest = hashlib.sha1(**extra)
+
+    header = b'blob ' + str(len(data)).encode() + b'\0'
+    digest.update(header)
+    digest.update(data)
+    return digest.hexdigest()
+
+
+@dataclass
+class UploadInfo:
+    """
+    Dataclass holding required information to determine whether a blob
+    should be uploaded to the hub using the LFS protocol or the regular protocol
+
+    Args:
+        sha256 (`str`):
+            SHA256 hash of the blob
+        size (`int`):
+            Size in bytes of the blob
+        sample (`bytes`):
+            First 512 bytes of the blob
+    """
+
+    sha256: str
+    size: int
+    sample: bytes
+
+    @classmethod
+    def from_path(cls, path: str):
+        """Build the info of a local file."""
+        file_hash_info: dict = get_file_hash(path)
+        with open(path, 'rb') as f:  # the handle is closed as soon as the sample is read
+            sample = f.read(512)
+        return cls(sha256=file_hash_info['file_hash'], size=file_hash_info['file_size'], sample=sample)
+
+    @classmethod
+    def from_bytes(cls, data: bytes):
+        """Build the info of an in-memory blob."""
+        return cls(size=len(data), sample=data[:512], sha256=get_file_hash(data)['file_hash'])
+
+    @classmethod
+    def from_fileobj(cls, fileobj: BinaryIO):
+        """Build the info of a binary stream and leave the stream at its initial position."""
+        start = fileobj.tell()
+        # Take the sample before hashing: hashing reads the stream up to its end.
+        sample = fileobj.read(512)
+        fileobj.seek(start)
+        fileobj_info: dict = get_file_hash(fileobj)
+        fileobj.seek(start)
+        return cls(sha256=fileobj_info['file_hash'], size=fileobj_info['file_size'], sample=sample)
+
+
+@dataclass
+class CommitOperationAdd:
+    """Data structure containing information about a file to be added to a commit."""
+
+    path_in_repo: str
+    path_or_fileobj: Union[str, Path, bytes, BinaryIO]
+    upload_info: UploadInfo = field(init=False, repr=False)
+
+    # Internal attributes
+
+    # set to "lfs" or "normal" once known
+    _upload_mode: Optional[UploadMode] = field(
+        init=False, repr=False, default=None)
+
+    # set to True if .gitignore rules prevent the file from being uploaded as LFS
+    # (server-side check)
+    _should_ignore: Optional[bool] = field(
+        init=False, repr=False, default=None)
+
+    # set to the remote OID of the file if it has already been uploaded
+    # useful to determine if a commit will be empty or not
+    _remote_oid: Optional[str] = field(init=False, repr=False, default=None)
+
+    # set to True once the file has been uploaded as LFS
+    _is_uploaded: bool = field(init=False, repr=False, default=False)
+
+    # set to True once the file has been committed
+    _is_committed: bool = field(init=False, repr=False, default=False)
+
+    def __post_init__(self) -> None:
+        """Validates `path_or_fileobj` and compute `upload_info`."""
+
+        # Validate `path_or_fileobj` value
+        if isinstance(self.path_or_fileobj, Path):
+            self.path_or_fileobj = str(self.path_or_fileobj)
+        if isinstance(self.path_or_fileobj, str):
+            path_or_fileobj = os.path.normpath(
+                os.path.expanduser(self.path_or_fileobj))
+            if not os.path.isfile(path_or_fileobj):
+                raise ValueError(
+                    f"Provided path: '{path_or_fileobj}' is not a file on the local file system"
+                )
+        elif not isinstance(self.path_or_fileobj, (io.BufferedIOBase, bytes)):
+            raise ValueError(
+                'path_or_fileobj must be either an instance of str, bytes or'
+                ' io.BufferedIOBase. If you passed a file-like object, make sure it is'
+                ' in binary mode.')
+        if isinstance(self.path_or_fileobj, io.BufferedIOBase):
+            try:
+                self.path_or_fileobj.tell()
+                self.path_or_fileobj.seek(0, os.SEEK_CUR)
+            except (OSError, AttributeError) as exc:
+                raise ValueError(
+                    'path_or_fileobj is a file-like object but does not implement seek() and tell()'
+                ) from exc
+
+        # Compute "upload_info" attribute
+        if isinstance(self.path_or_fileobj, str):
+            self.upload_info = UploadInfo.from_path(self.path_or_fileobj)
+        elif isinstance(self.path_or_fileobj, bytes):
+            self.upload_info = UploadInfo.from_bytes(self.path_or_fileobj)
+        else:
+            self.upload_info = UploadInfo.from_fileobj(self.path_or_fileobj)
+
+    @contextmanager
+    def as_file(self) -> Iterator[BinaryIO]:
+        """
+        A context manager that yields a file-like object allowing to read the underlying
+        data behind `path_or_fileobj`.
+        """
+        if isinstance(self.path_or_fileobj, str) or isinstance(
+                self.path_or_fileobj, Path):
+            with open(self.path_or_fileobj, 'rb') as file:
+                yield file
+        elif isinstance(self.path_or_fileobj, bytes):
+            yield io.BytesIO(self.path_or_fileobj)
+        elif isinstance(self.path_or_fileobj, io.BufferedIOBase):
+            prev_pos = self.path_or_fileobj.tell()
+            yield self.path_or_fileobj
+            self.path_or_fileobj.seek(prev_pos, 0)
+
+    def b64content(self) -> bytes:
+        """
+        The base64-encoded content of `path_or_fileobj`
+
+        Returns: `bytes`
+        """
+        with self.as_file() as file:
+            return base64.b64encode(file.read())
+
+    @property
+    def _local_oid(self) -> Optional[str]:
+        """Return the OID of the local file.
+
+        This OID is then compared to `self._remote_oid` to check if the file has changed compared to the remote one.
+        If the file did not change, we won't upload it again to prevent empty commits.
+
+        For LFS files, the OID corresponds to the SHA256 of the file content (used a LFS ref).
+        For regular files, the OID corresponds to the SHA1 of the file content.
+        Note: this is slightly different to git OID computation since the oid of an LFS file is usually the git-SHA1
+            of the pointer file content (not the actual file content). However, using the SHA256 is enough to detect
+            changes and more convenient client-side.
+        """
+        if self._upload_mode is None:
+            return None
+        elif self._upload_mode == 'lfs':
+            return self.upload_info.sha256
+        else:
+            # Regular file => compute sha1
+            # => no need to read by chunk since the file is guaranteed to be <=5MB.
+            with self.as_file() as file:
+                return git_hash(file.read())
+
+
+CommitOperation = Union[CommitOperationAdd, ]  # single-member Union alias
diff --git a/modelscope/utils/thread_utils.py b/modelscope/utils/thread_utils.py
index 1417962e..41aefbe4 100644
--- a/modelscope/utils/thread_utils.py
+++ b/modelscope/utils/thread_utils.py
@@ -12,13 +12,15 @@ logger = get_logger()
 
 
 def thread_executor(max_workers: int = DEFAULT_MAX_WORKERS,
-                    disable_tqdm=False):
+                    disable_tqdm: bool = False,
+                    tqdm_desc: str = None):
     """
     A decorator to execute a function in a threaded manner using ThreadPoolExecutor.
 
     Args:
         max_workers (int): The maximum number of threads to use.
         disable_tqdm (bool): disable progress bar.
+        tqdm_desc (str): Desc of tqdm.
 
     Returns:
         function: A wrapped function that executes with threading and a progress bar.
@@ -43,8 +45,11 @@ def thread_executor(max_workers: int = DEFAULT_MAX_WORKERS,
             results = []
             # Create a tqdm progress bar with the total number of items to process
             with tqdm(
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    initial=0,
                     total=len(iterable),
-                    desc=f'Processing {len(iterable)} items',
+                    desc=tqdm_desc or f'Processing {len(iterable)} items',
                     disable=disable_tqdm,
             ) as pbar:
                 # Define a wrapper function to update the progress bar