From 469159de34ce92da3fb86abd2b9640c0ed153a3d Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingdachen@apache.org>
Date: Wed, 11 Sep 2024 11:40:37 +0800
Subject: [PATCH 1/5] use tqdm auto (#982)

Co-authored-by: Yingda Chen <yingda.chen@alibaba-inc.com>
---
 modelscope/hub/api.py           | 2 +-
 modelscope/hub/file_download.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 99eccd16..afa5cf8e 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -834,7 +834,7 @@ class HubApi:
         Fetch the meta-data files from the url, e.g. csv/jsonl files.
         """
         import hashlib
-        from tqdm import tqdm
+        from tqdm.auto import tqdm
         import pandas as pd
 
         out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest())
diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py
index 7bbc49e1..50b9e8cb 100644
--- a/modelscope/hub/file_download.py
+++ b/modelscope/hub/file_download.py
@@ -14,7 +14,7 @@ from typing import Dict, Optional, Union
 
 import requests
 from requests.adapters import Retry
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
 from modelscope.hub.api import HubApi, ModelScopeConfig
 from modelscope.hub.constants import (

From 51b33cecefed4daad3dccc47e0da60d5923ce8de Mon Sep 17 00:00:00 2001
From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
Date: Wed, 11 Sep 2024 19:35:02 +0800
Subject: [PATCH 2/5] Support create file with size 0 (#984)

* Support file size == 0
---
 modelscope/hub/file_download.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py
index 50b9e8cb..542c42af 100644
--- a/modelscope/hub/file_download.py
+++ b/modelscope/hub/file_download.py
@@ -471,7 +471,7 @@ def http_get_model_file(
                 with open(temp_file_path, 'rb') as f:
                     partial_length = f.seek(0, io.SEEK_END)
                     progress.update(partial_length)
-            if partial_length >= file_size:
+            if partial_length >= file_size > 0:
                 break
             # closed range[], from 0.
             get_headers['Range'] = 'bytes=%s-%s' % (partial_length,

From 4c518db4246507155e8be381685df419ced3b2b5 Mon Sep 17 00:00:00 2001
From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
Date: Sat, 14 Sep 2024 12:24:19 +0800
Subject: [PATCH 3/5] patch hf hub (#987)

---
 modelscope/hub/api.py       |  20 +++++
 modelscope/utils/hf_util.py | 161 +++++++++++++++++++++++++++++++++++-
 requirements/datasets.txt   |   2 +-
 requirements/framework.txt  |   2 +-
 4 files changed, 182 insertions(+), 3 deletions(-)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index afa5cf8e..41c11282 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -661,6 +661,26 @@ class HubApi:
             files.append(file)
         return files
 
+    def file_exists(
+            self,
+            repo_id: str,
+            filename: str,
+            *,
+            revision: Optional[str] = None,
+    ):
+        """Check whether a file is listed in a model repo.
+
+        Args:
+            repo_id (`str`): The id of the repo to query.
+            filename (`str`): Compared against the `Name` of each listed file.
+            revision (`Optional[str]`): Forwarded to `get_model_files`.
+        Returns:
+            `True` if a listed file has that `Name`, otherwise `False`.
+        """
+        listing = self.get_model_files(repo_id, revision=revision)
+        known_names = [entry['Name'] for entry in listing]
+        return filename in known_names
+
     def create_dataset(self,
                        dataset_name: str,
                        namespace: str,
diff --git a/modelscope/utils/hf_util.py b/modelscope/utils/hf_util.py
index 14a3713c..01aeebef 100644
--- a/modelscope/utils/hf_util.py
+++ b/modelscope/utils/hf_util.py
@@ -1,5 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import importlib
 import os
+from pathlib import Path
+from types import MethodType
+from typing import Dict, Literal, Optional, Union
 
 from transformers import AutoConfig as AutoConfigHF
 from transformers import AutoImageProcessor as AutoImageProcessorHF
@@ -14,10 +18,12 @@ from transformers import AutoTokenizer as AutoTokenizerHF
 from transformers import BatchFeature as BatchFeatureHF
 from transformers import BitsAndBytesConfig as BitsAndBytesConfigHF
 from transformers import GenerationConfig as GenerationConfigHF
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers import (PretrainedConfig, PreTrainedModel,
+                          PreTrainedTokenizerBase)
 
 from modelscope import snapshot_download
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke
+from .logger import get_logger
 
 try:
     from transformers import GPTQConfig as GPTQConfigHF
@@ -26,6 +32,8 @@ except ImportError:
     GPTQConfigHF = None
     AwqConfigHF = None
 
+logger = get_logger()
+
 
 def user_agent(invoked_by=None):
     if invoked_by is None:
@@ -34,6 +42,157 @@ def user_agent(invoked_by=None):
     return uagent
 
 
+def _try_login(token: Union[str, bool, None] = None):
+    # HF-style callers may pass a bool; only a non-empty str is a real token.
+    from modelscope.hub.api import HubApi
+    if token is None or token is True:
+        token = os.environ.get('MODELSCOPE_API_TOKEN')
+    if isinstance(token, str) and token:
+        HubApi().login(token)
+
+
+def _file_exists(
+    self,  # whatever object patch_hub binds this function to; not used below
+    repo_id: str,
+    filename: str,
+    *,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    token: Union[str, bool, None] = None,
+) -> bool:
+    """Stand-in for huggingface_hub.file_exists that asks modelscope's HubApi."""
+    if repo_type is not None:
+        logger.warning(
+            'The passed in repo_type will not be used in modelscope. Now only model repo can be queried.'
+        )
+    _try_login(token)
+    from modelscope.hub.api import HubApi
+    api = HubApi()
+    return api.file_exists(repo_id, filename, revision=revision)
+
+
+def _file_download(repo_id: str,
+                   filename: str,
+                   *,
+                   subfolder: Optional[str] = None,
+                   repo_type: Optional[str] = None,
+                   revision: Optional[str] = None,
+                   cache_dir: Union[str, Path, None] = None,
+                   local_dir: Union[str, Path, None] = None,
+                   token: Union[bool, str, None] = None,
+                   local_files_only: bool = False,
+                   **kwargs):
+    """Patch huggingface_hub.hf_hub_download to download from ModelScope.
+
+    HF-only keyword arguments land in `kwargs` and are ignored with a warning.
+    `subfolder` is joined with '/' because hub file paths are not OS paths.
+    """
+    if kwargs:
+        logger.warning(
+            'The passed in library_name,library_version,user_agent,force_download,proxies,'
+            'etag_timeout,headers,endpoint will not be used in modelscope.')
+    assert repo_type in (None, 'model', 'dataset'), (
+        f'repo_type={repo_type} is not supported in ModelScope')
+    if repo_type in (None, 'model'):
+        from modelscope.hub.file_download import model_file_download as file_download
+    else:
+        from modelscope.hub.file_download import dataset_file_download as file_download
+    _try_login(token)
+    return file_download(
+        repo_id,
+        file_path=f"{subfolder.rstrip('/')}/{filename}" if subfolder else filename,
+        cache_dir=cache_dir, local_dir=local_dir,
+        local_files_only=local_files_only, revision=revision)
+
+
+def _patch_pretrained_class():
+    """Make transformers base classes load model ids from the modelscope hub."""
+
+    def get_model_dir(pretrained_model_name_or_path, ignore_file_pattern,
+                      **kwargs):
+        """Return the path itself if it exists locally, else a hub snapshot."""
+        if os.path.exists(pretrained_model_name_or_path):
+            return pretrained_model_name_or_path
+        return snapshot_download(
+            pretrained_model_name_or_path,
+            revision=kwargs.get('revision'),
+            ignore_file_pattern=ignore_file_pattern)
+
+    def patch_tokenizer_base():
+        """ Monkey patch PreTrainedTokenizerBase.from_pretrained to adapt to modelscope hub.
+        Files matching the .bin/.safetensors patterns are passed as ignore_file_pattern."""
+        ori_from_pretrained = PreTrainedTokenizerBase.from_pretrained.__func__
+
+        @classmethod
+        def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
+                            **kwargs):
+            ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
+            model_dir = get_model_dir(pretrained_model_name_or_path,
+                                      ignore_file_pattern, **kwargs)
+            return ori_from_pretrained(cls, model_dir, *model_args, **kwargs)
+
+        PreTrainedTokenizerBase.from_pretrained = from_pretrained
+
+    def patch_config_base():
+        """ Monkey patch PretrainedConfig.from_pretrained and get_config_dict to adapt to modelscope hub.
+        """
+        ori_from_pretrained = PretrainedConfig.from_pretrained.__func__
+        ori_get_config_dict = PretrainedConfig.get_config_dict.__func__
+
+        @classmethod
+        def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
+                            **kwargs):
+            ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
+            model_dir = get_model_dir(pretrained_model_name_or_path,
+                                      ignore_file_pattern, **kwargs)
+            return ori_from_pretrained(cls, model_dir, *model_args, **kwargs)
+
+        @classmethod
+        def get_config_dict(cls, pretrained_model_name_or_path, **kwargs):
+            ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
+            model_dir = get_model_dir(pretrained_model_name_or_path,
+                                      ignore_file_pattern, **kwargs)
+            return ori_get_config_dict(cls, model_dir, **kwargs)
+
+        PretrainedConfig.from_pretrained = from_pretrained
+        PretrainedConfig.get_config_dict = get_config_dict
+
+    def patch_model_base():
+        """ Monkey patch PreTrainedModel.from_pretrained to adapt to modelscope hub.
+        """
+        ori_from_pretrained = PreTrainedModel.from_pretrained.__func__
+
+        @classmethod
+        def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
+                            **kwargs):
+            model_dir = get_model_dir(pretrained_model_name_or_path, None,
+                                      **kwargs)
+            return ori_from_pretrained(cls, model_dir, *model_args, **kwargs)
+
+        PreTrainedModel.from_pretrained = from_pretrained
+
+    patch_tokenizer_base()
+    patch_config_base()
+    patch_model_base()
+
+
+def patch_hub():
+    """Patch huggingface_hub (hf_hub_download, file_exists) and the transformers
+    base classes so that models are downloaded from modelscope."""
+    import huggingface_hub
+    from huggingface_hub import hf_api
+    from huggingface_hub.hf_api import api
+
+    huggingface_hub.hf_hub_download = _file_download
+    huggingface_hub.file_download.hf_hub_download = _file_download
+
+    hf_api.file_exists = MethodType(_file_exists, api)
+    huggingface_hub.file_exists = hf_api.file_exists
+    huggingface_hub.hf_api.file_exists = hf_api.file_exists
+
+    _patch_pretrained_class()
+
+
 def get_wrapped_class(module_class, ignore_file_pattern=[], **kwargs):
     """Get a custom wrapper class for  auto classes to download the models from the ModelScope hub
     Args:
diff --git a/requirements/datasets.txt b/requirements/datasets.txt
index 6ca2d853..9035b3e6 100644
--- a/requirements/datasets.txt
+++ b/requirements/datasets.txt
@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=2.18.0
+datasets>=2.18.0,<3.0.0
 einops
 oss2
 Pillow
diff --git a/requirements/framework.txt b/requirements/framework.txt
index d3ac7876..23f5b639 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=2.18.0
+datasets>=2.18.0,<3.0.0
 einops
 oss2
 Pillow

From 74d97ea7e09636b3860be7067e3a4ae8a01bd803 Mon Sep 17 00:00:00 2001
From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
Date: Sat, 14 Sep 2024 15:12:28 +0800
Subject: [PATCH 4/5] Refactor zero sized file downloading (#991)

---
 modelscope/hub/file_download.py  | 10 ++++++++--
 tests/hub/test_hub_empty_file.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 tests/hub/test_hub_empty_file.py

diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py
index 542c42af..f1cbce6f 100644
--- a/modelscope/hub/file_download.py
+++ b/modelscope/hub/file_download.py
@@ -461,17 +461,23 @@ def http_get_model_file(
                 unit='B',
                 unit_scale=True,
                 unit_divisor=1024,
-                total=file_size,
+                total=file_size if file_size > 0 else 1,
                 initial=0,
                 desc='Downloading [' + file_name + ']',
             )
+            if file_size == 0:
+                # Avoid empty file server request
+                with open(temp_file_path, 'w+'):
+                    progress.update(1)
+                    progress.close()
+                    break
             partial_length = 0
             if os.path.exists(
                     temp_file_path):  # download partial, continue download
                 with open(temp_file_path, 'rb') as f:
                     partial_length = f.seek(0, io.SEEK_END)
                     progress.update(partial_length)
-            if partial_length >= file_size > 0:
+            if partial_length >= file_size:
                 break
             # closed range[], from 0.
             get_headers['Range'] = 'bytes=%s-%s' % (partial_length,
diff --git a/tests/hub/test_hub_empty_file.py b/tests/hub/test_hub_empty_file.py
new file mode 100644
index 00000000..b73b1a66
--- /dev/null
+++ b/tests/hub/test_hub_empty_file.py
@@ -0,0 +1,31 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+import shutil
+import tempfile
+import unittest
+
+from modelscope import snapshot_download
+
+
+class HubEmptyFile(unittest.TestCase):
+    """Download a test repo and check that every expected file is present."""
+
+    def setUp(self):
+        self.work_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.work_dir, ignore_errors=True)
+
+    def test_download_empty_file(self):
+        model_dir = snapshot_download(
+            'tastelikefeet/test_empty_download', cache_dir=self.work_dir)
+        self.assertIsNotNone(model_dir)
+        # Each of these files must exist in the downloaded snapshot.
+        expected_files = ('1.txt', 'configuration.json', 'init.py',
+                          'README.md')
+        for name in expected_files:
+            self.assertTrue(os.path.exists(os.path.join(model_dir, name)))
+
+
+if __name__ == '__main__':
+    unittest.main()

From d5c9c82340f39c0c63f32503725582a0959600aa Mon Sep 17 00:00:00 2001
From: suluyana <110878454+suluyana@users.noreply.github.com>
Date: Wed, 18 Sep 2024 08:48:20 +0800
Subject: [PATCH 5/5] Fix problems with serializing audio output in serving
 (#993)

* fix audio out

* fix value in json output

* fix audio out
---
 modelscope/utils/input_output.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/modelscope/utils/input_output.py b/modelscope/utils/input_output.py
index 37e875bc..50010baf 100644
--- a/modelscope/utils/input_output.py
+++ b/modelscope/utils/input_output.py
@@ -787,7 +787,12 @@ def pipeline_output_to_service_base64_output(task_name, pipeline_output):
         pipeline_output = pipeline_output[0]
     for key, value in pipeline_output.items():
         if key not in task_outputs:
-            json_serializable_output[key] = value
+            import torch
+            if isinstance(value, torch.Tensor):
+                v = np.array(value.cpu()).tolist()
+            else:
+                v = value
+            json_serializable_output[key] = v
             continue  # skip the output not defined.
         if key in [
                 OutputKeys.OUTPUT_IMG, OutputKeys.OUTPUT_IMGS,