From 469159de34ce92da3fb86abd2b9640c0ed153a3d Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Wed, 11 Sep 2024 11:40:37 +0800 Subject: [PATCH 1/5] use tqdm auto (#982) Co-authored-by: Yingda Chen --- modelscope/hub/api.py | 2 +- modelscope/hub/file_download.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 99eccd16..afa5cf8e 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -834,7 +834,7 @@ class HubApi: Fetch the meta-data files from the url, e.g. csv/jsonl files. """ import hashlib - from tqdm import tqdm + from tqdm.auto import tqdm import pandas as pd out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest()) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index 7bbc49e1..50b9e8cb 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -14,7 +14,7 @@ from typing import Dict, Optional, Union import requests from requests.adapters import Retry -from tqdm import tqdm +from tqdm.auto import tqdm from modelscope.hub.api import HubApi, ModelScopeConfig from modelscope.hub.constants import ( From 51b33cecefed4daad3dccc47e0da60d5923ce8de Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Wed, 11 Sep 2024 19:35:02 +0800 Subject: [PATCH 2/5] Support create file with size 0 (#984) * Support file size == 0 --- modelscope/hub/file_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index 50b9e8cb..542c42af 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -471,7 +471,7 @@ def http_get_model_file( with open(temp_file_path, 'rb') as f: partial_length = f.seek(0, io.SEEK_END) progress.update(partial_length) - if partial_length >= file_size: + if partial_length >= file_size > 0: break # closed range[], from 0. get_headers['Range'] = 'bytes=%s-%s' % (partial_length, From 4c518db4246507155e8be381685df419ced3b2b5 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Sat, 14 Sep 2024 12:24:19 +0800 Subject: [PATCH 3/5] patch hf hub (#987) --- modelscope/hub/api.py | 20 +++++ modelscope/utils/hf_util.py | 161 +++++++++++++++++++++++++++++++++++- requirements/datasets.txt | 2 +- requirements/framework.txt | 2 +- 4 files changed, 182 insertions(+), 3 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index afa5cf8e..41c11282 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -661,6 +661,26 @@ class HubApi: files.append(file) return files + def file_exists( + self, + repo_id: str, + filename: str, + *, + revision: Optional[str] = None, + ): + """Get if the specified file exists + + Args: + repo_id (`str`): The repo id to use + filename (`str`): The queried filename + revision (`Optional[str]`): The repo revision + Returns: + The query result in bool value + """ + files = self.get_model_files(repo_id, revision=revision) + files = [file['Name'] for file in files] + return filename in files + def create_dataset(self, dataset_name: str, namespace: str, diff --git a/modelscope/utils/hf_util.py b/modelscope/utils/hf_util.py index 14a3713c..01aeebef 100644 --- a/modelscope/utils/hf_util.py +++ b/modelscope/utils/hf_util.py @@ -1,5 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import importlib import os +from pathlib import Path +from types import MethodType +from typing import Dict, Literal, Optional, Union from transformers import AutoConfig as AutoConfigHF from transformers import AutoImageProcessor as AutoImageProcessorHF @@ -14,10 +18,12 @@ from transformers import AutoTokenizer as AutoTokenizerHF from transformers import BatchFeature as BatchFeatureHF from transformers import BitsAndBytesConfig as BitsAndBytesConfigHF from transformers import GenerationConfig as GenerationConfigHF -from transformers import PreTrainedModel, PreTrainedTokenizerBase +from transformers import (PretrainedConfig, PreTrainedModel, + PreTrainedTokenizerBase) from modelscope import snapshot_download from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke +from .logger import get_logger try: from transformers import GPTQConfig as GPTQConfigHF @@ -26,6 +32,8 @@ except ImportError: GPTQConfigHF = None AwqConfigHF = None +logger = get_logger() + def user_agent(invoked_by=None): if invoked_by is None: @@ -34,6 +42,157 @@ def user_agent(invoked_by=None): return uagent +def _try_login(token: Optional[str] = None): + from modelscope.hub.api import HubApi + api = HubApi() + if token is None: + token = os.environ.get('MODELSCOPE_API_TOKEN') + if token: + api.login(token) + + +def _file_exists( + self, + repo_id: str, + filename: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + token: Union[str, bool, None] = None, +): + """Patch huggingface_hub.file_exists""" + if repo_type is not None: + logger.warning( + 'The passed in repo_type will not be used in modelscope. Now only model repo can be queried.' + ) + _try_login(token) + from modelscope.hub.api import HubApi + api = HubApi() + return api.file_exists(repo_id, filename, revision=revision) + + +def _file_download(repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + local_dir: Union[str, Path, None] = None, + token: Union[bool, str, None] = None, + local_files_only: bool = False, + **kwargs): + """Patch huggingface_hub.hf_hub_download""" + if len(kwargs) > 0: + logger.warning( + 'The passed in library_name,library_version,user_agent,force_download,proxies' + 'etag_timeout,headers,endpoint ' + 'will not be used in modelscope.') + assert repo_type in ( + None, 'model', + 'dataset'), f'repo_type={repo_type} is not supported in ModelScope' + if repo_type in (None, 'model'): + from modelscope.hub.file_download import model_file_download as file_download + else: + from modelscope.hub.file_download import dataset_file_download as file_download + _try_login(token) + return file_download( + repo_id, + file_path=os.path.join(subfolder, filename) if subfolder else filename, + cache_dir=cache_dir, + local_dir=local_dir, + local_files_only=local_files_only, + revision=revision) + + +def _patch_pretrained_class(): + + def get_model_dir(pretrained_model_name_or_path, ignore_file_pattern, + **kwargs): + if not os.path.exists(pretrained_model_name_or_path): + revision = kwargs.pop('revision', None) + model_dir = snapshot_download( + pretrained_model_name_or_path, + revision=revision, + ignore_file_pattern=ignore_file_pattern) + else: + model_dir = pretrained_model_name_or_path + return model_dir + + def patch_tokenizer_base(): + """ Monkey patch PreTrainedTokenizerBase.from_pretrained to adapt to modelscope hub. + """ + ori_from_pretrained = PreTrainedTokenizerBase.from_pretrained.__func__ + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, + **kwargs): + ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors'] + model_dir = get_model_dir(pretrained_model_name_or_path, + ignore_file_pattern, **kwargs) + return ori_from_pretrained(cls, model_dir, *model_args, **kwargs) + + PreTrainedTokenizerBase.from_pretrained = from_pretrained + + def patch_config_base(): + """ Monkey patch PretrainedConfig.from_pretrained to adapt to modelscope hub. + """ + ori_from_pretrained = PretrainedConfig.from_pretrained.__func__ + ori_get_config_dict = PretrainedConfig.get_config_dict.__func__ + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, + **kwargs): + ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors'] + model_dir = get_model_dir(pretrained_model_name_or_path, + ignore_file_pattern, **kwargs) + return ori_from_pretrained(cls, model_dir, *model_args, **kwargs) + + @classmethod + def get_config_dict(cls, pretrained_model_name_or_path, **kwargs): + ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors'] + model_dir = get_model_dir(pretrained_model_name_or_path, + ignore_file_pattern, **kwargs) + return ori_get_config_dict(cls, model_dir, **kwargs) + + PretrainedConfig.get_config_dict = get_config_dict + + def patch_model_base(): + """ Monkey patch PreTrainedModel.from_pretrained to adapt to modelscope hub. + """ + ori_from_pretrained = PreTrainedModel.from_pretrained.__func__ + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, + **kwargs): + model_dir = get_model_dir(pretrained_model_name_or_path, None, + **kwargs) + return ori_from_pretrained(cls, model_dir, *model_args, **kwargs) + + PreTrainedModel.from_pretrained = from_pretrained + + patch_tokenizer_base() + patch_config_base() + patch_model_base() + + +def patch_hub(): + """Patch hf hub, which to make users can download models from modelscope to speed up. + """ + import huggingface_hub + from huggingface_hub import hf_api + from huggingface_hub.hf_api import api + + huggingface_hub.hf_hub_download = _file_download + huggingface_hub.file_download.hf_hub_download = _file_download + + hf_api.file_exists = MethodType(_file_exists, api) + huggingface_hub.file_exists = hf_api.file_exists + huggingface_hub.hf_api.file_exists = hf_api.file_exists + + _patch_pretrained_class() + + def get_wrapped_class(module_class, ignore_file_pattern=[], **kwargs): """Get a custom wrapper class for auto classes to download the models from the ModelScope hub Args: diff --git a/requirements/datasets.txt b/requirements/datasets.txt index 6ca2d853..9035b3e6 100644 --- a/requirements/datasets.txt +++ b/requirements/datasets.txt @@ -1,6 +1,6 @@ addict attrs -datasets>=2.18.0 +datasets>=2.18.0,<3.0.0 einops oss2 Pillow diff --git a/requirements/framework.txt b/requirements/framework.txt index d3ac7876..23f5b639 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,6 @@ addict attrs -datasets>=2.18.0 +datasets>=2.18.0,<3.0.0 einops oss2 Pillow From 74d97ea7e09636b3860be7067e3a4ae8a01bd803 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Sat, 14 Sep 2024 15:12:28 +0800 Subject: [PATCH 4/5] Refactor zero sized file downloading (#991) --- modelscope/hub/file_download.py | 10 ++++++++-- tests/hub/test_hub_empty_file.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 tests/hub/test_hub_empty_file.py diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index 542c42af..f1cbce6f 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -461,17 +461,23 @@ def http_get_model_file( unit='B', unit_scale=True, unit_divisor=1024, - total=file_size, + total=file_size if file_size > 0 else 1, initial=0, desc='Downloading [' + file_name + ']', ) + if file_size == 0: + # Avoid empty file server request + with open(temp_file_path, 'w+'): + progress.update(1) + progress.close() + break partial_length = 0 if os.path.exists( temp_file_path): # download partial, continue download with open(temp_file_path, 'rb') as f: partial_length = f.seek(0, io.SEEK_END) progress.update(partial_length) - if partial_length >= file_size > 0: + if partial_length >= file_size: break # closed range[], from 0. get_headers['Range'] = 'bytes=%s-%s' % (partial_length, diff --git a/tests/hub/test_hub_empty_file.py b/tests/hub/test_hub_empty_file.py new file mode 100644 index 00000000..b73b1a66 --- /dev/null +++ b/tests/hub/test_hub_empty_file.py @@ -0,0 +1,31 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path +import shutil +import tempfile +import unittest + +from modelscope import snapshot_download + + +class HubEmptyFile(unittest.TestCase): + + def setUp(self): + temporary_dir = tempfile.mkdtemp() + self.work_dir = temporary_dir + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_download_empty_file(self): + model_dir = snapshot_download( + 'tastelikefeet/test_empty_download', cache_dir=self.work_dir) + self.assertTrue(model_dir is not None) + self.assertTrue(os.path.exists(os.path.join(model_dir, '1.txt'))) + self.assertTrue( + os.path.exists(os.path.join(model_dir, 'configuration.json'))) + self.assertTrue(os.path.exists(os.path.join(model_dir, 'init.py'))) + self.assertTrue(os.path.exists(os.path.join(model_dir, 'README.md'))) + + +if __name__ == '__main__': + unittest.main() From d5c9c82340f39c0c63f32503725582a0959600aa Mon Sep 17 00:00:00 2001 From: suluyana <110878454+suluyana@users.noreply.github.com> Date: Wed, 18 Sep 2024 08:48:20 +0800 Subject: [PATCH 5/5] Fix problems with serializing audio output in serving (#993) * fix audio out * fix value in json output * fix audio out --- modelscope/utils/input_output.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modelscope/utils/input_output.py b/modelscope/utils/input_output.py index 37e875bc..50010baf 100644 --- a/modelscope/utils/input_output.py +++ b/modelscope/utils/input_output.py @@ -787,7 +787,12 @@ def pipeline_output_to_service_base64_output(task_name, pipeline_output): pipeline_output = pipeline_output[0] for key, value in pipeline_output.items(): if key not in task_outputs: - json_serializable_output[key] = value + import torch + if isinstance(value, torch.Tensor): + v = np.array(value.cpu()).tolist() + else: + v = value + json_serializable_output[key] = v continue # skip the output not defined. if key in [ OutputKeys.OUTPUT_IMG, OutputKeys.OUTPUT_IMGS,