diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4fdf7351..f5a42ca4 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -3,7 +3,7 @@ name: Bug report about: Create a bug report to help us improve title: '' labels: '' -assignees: Firmament-cyou, tastelikefeet, wangxingjun778, wenmengzhou, zzclynn +assignees: tastelikefeet, wangxingjun778, yingdachen --- @@ -36,14 +36,14 @@ A clear and concise description of what the bug is. Please @ corresponding people according to your problem: -Model related: @wenmengzhou @tastelikefeet +Model related: @tastelikefeet -Model hub related: @liuyhwangyh +Model hub related: @liuyhwangyh @tastelikefeet @wangxingjun778 Dataset releated: @wangxingjun778 Finetune related: @tastelikefeet @Jintao-Huang -Pipeline related: @Firmament-cyou @wenmengzhou +Pipeline related: @tastelikefeet @wangxingjun778 -Contribute your model: @zzclynn +Contribute your model: @yingdachen diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 0731f3c1..6eef2aa5 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -3,7 +3,7 @@ name: Feature request about: Suggest an idea for this project title: '' labels: '' -assignees: tastelikefeet, wangxingjun778, wenmengzhou, yingdachen, zzclynn +assignees: yingdachen, wangxingjun778, tastelikefeet --- diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index c7ec7256..3545e543 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -3,7 +3,7 @@ name: Question about: Describe this issue template's purpose here. 
title: '' labels: '' -assignees: zzclynn,wenmengzhou +assignees: tastelikefeet, wangxingjun778, yingdachen --- @@ -18,7 +18,7 @@ Before asking a question, make sure you have: Please @ corresponding people according to your problem: -Model related: @wenmengzhou @tastelikefeet +Model related: @tastelikefeet Model hub related: @liuyhwangyh @@ -26,6 +26,6 @@ Dataset releated: @wangxingjun778 Finetune related: @tastelikefeet @Jintao-Huang -Pipeline related: @Firmament-cyou @wenmengzhou +Pipeline related: @tastelikefeet @wangxingjun778 -Contribute your model: @zzclynn +Contribute your model: @yingdachen diff --git a/modelscope/cli/clearcache.py b/modelscope/cli/clearcache.py new file mode 100644 index 00000000..7b89103b --- /dev/null +++ b/modelscope/cli/clearcache.py @@ -0,0 +1,107 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +from argparse import ArgumentParser +from pathlib import Path + +from modelscope.cli.base import CLICommand +from modelscope.hub.constants import TEMPORARY_FOLDER_NAME + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. + """ + return ClearCacheCMD(args) + + +class ClearCacheCMD(CLICommand): + name = 'clear-cache' + + def __init__(self, args): + self.args = args + self.cache_dir = os.getenv( + 'MODELSCOPE_CACHE', + Path.home().joinpath('.cache', 'modelscope')) + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for clear-cache command. + """ + parser = parsers.add_parser(ClearCacheCMD.name) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--model', + type=str, + help= + 'The id of the model whose cache will be cleared. For clear-cache, ' + 'if neither model or dataset id is provided, entire cache will be cleared.' + ) + group.add_argument( + '--dataset', + type=str, + help= + 'The id of the dataset whose cache will be cleared. 
For clear-cache, ' + 'if neither model or dataset id is provided, entire cache will be cleared.' + ) + + parser.set_defaults(func=subparser_func) + + def execute(self): + self._execute_with_confirmation() + + def _execute_with_confirmation(self): + all = False + single_model = False + prompt = '\nYou are about to delete ' + + if self.args.model or self.args.dataset: + if self.args.model: + id = self.args.model + single_model = True + prompt = prompt + f'local cache for model {id}. ' + else: + id = self.args.dataset + prompt = prompt + f'local cache for dataset {id}. ' + else: + prompt = prompt + f'entire ModelScope cache at {self.cache_dir}, including ALL models and dataset.\n' + all = True + user_input = input( + prompt + + '\nPlease press Y or y to proceed, any other key to abort.\n' + ).strip().upper() + + if user_input == 'Y': + if all: + self._remove_directory(self.cache_dir) + print('Cache cleared.') + else: + entity_directory = os.path.join( + self.cache_dir, 'hub' if single_model else 'datasets', id) + temp_directory = os.path.join( + self.cache_dir, 'hub' if single_model else 'datasets', + TEMPORARY_FOLDER_NAME, id) + entity_removed = self._remove_directory(entity_directory) + temp_removed = self._remove_directory(temp_directory) + if (not entity_removed) and (not temp_removed): + if single_model: + print( + f'Cache for Model {id} not found. Nothing to do.') + else: + print( + f'Cache for Dataset {id} not found. Nothing to do.' 
+ ) + else: + print('Cache cleared.') + else: + print('Operation aborted.') + return + + def _remove_directory(self, path): + if os.path.exists(path): + try: + shutil.rmtree(path) + print(f'Cache folder {path} removed.') + return True + except Exception as e: + print(f'An error occurred while clearing cache at {path}: {e}') + return False diff --git a/modelscope/cli/cli.py b/modelscope/cli/cli.py index 5e3fcbfd..74fb05db 100644 --- a/modelscope/cli/cli.py +++ b/modelscope/cli/cli.py @@ -3,6 +3,7 @@ import argparse import logging +from modelscope.cli.clearcache import ClearCacheCMD from modelscope.cli.download import DownloadCMD from modelscope.cli.login import LoginCMD from modelscope.cli.modelcard import ModelCardCMD @@ -23,6 +24,7 @@ def run_cmd(): subparsers = parser.add_subparsers(help='modelscope commands helpers') DownloadCMD.define_args(subparsers) + ClearCacheCMD.define_args(subparsers) PluginsCMD.define_args(subparsers) PipelineCMD.define_args(subparsers) ModelCardCMD.define_args(subparsers) diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index 36204d93..5b3a8bb7 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -555,7 +555,7 @@ def get_module_without_script(self) -> DatasetModule: download_config = self.download_config.copy() if download_config.download_desc is None: - download_config.download_desc = 'Downloading readme' + download_config.download_desc = 'Downloading [README.md]' try: url_or_filename = _ms_api.get_dataset_file_url( file_name='README.md', @@ -989,7 +989,6 @@ class DatasetsWrapperHF: download_config=download_config, download_mode=download_mode, verification_mode=verification_mode, - try_from_hf_gcs=False, num_proc=num_proc, storage_options=storage_options, # base_path=builder_instance.base_path, diff --git a/modelscope/msdatasets/utils/hf_file_utils.py b/modelscope/msdatasets/utils/hf_file_utils.py index 
b2931f7e..863bb196 100644 --- a/modelscope/msdatasets/utils/hf_file_utils.py +++ b/modelscope/msdatasets/utils/hf_file_utils.py @@ -5,27 +5,138 @@ import json import os import re +import copy import shutil +import time import warnings -import inspect from contextlib import contextmanager from functools import partial from pathlib import Path +from typing import Optional, Union from urllib.parse import urljoin, urlparse import requests +from tqdm import tqdm from datasets import config -from datasets.utils.file_utils import hash_url_to_filename, get_authentication_headers_for_url, ftp_head, fsspec_head, \ - http_head, _raise_if_offline_mode_is_enabled, ftp_get, fsspec_get, http_get +from datasets.utils.file_utils import hash_url_to_filename, \ + get_authentication_headers_for_url, fsspec_head, fsspec_get from filelock import FileLock from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.logger import get_logger from modelscope.hub.api import ModelScopeConfig +from modelscope import __version__ + logger = get_logger() +def get_datasets_user_agent_ms(user_agent: Optional[Union[str, dict]] = None) -> str: + ua = f'datasets/{__version__}' + ua += f'; python/{config.PY_VERSION}' + ua += f'; pyarrow/{config.PYARROW_VERSION}' + if config.TORCH_AVAILABLE: + ua += f'; torch/{config.TORCH_VERSION}' + if config.TF_AVAILABLE: + ua += f'; tensorflow/{config.TF_VERSION}' + if config.JAX_AVAILABLE: + ua += f'; jax/{config.JAX_VERSION}' + if isinstance(user_agent, dict): + ua += f"; {'; '.join(f'{k}/{v}' for k, v in user_agent.items())}" + elif isinstance(user_agent, str): + ua += '; ' + user_agent + return ua + + +def _request_with_retry_ms( + method: str, + url: str, + max_retries: int = 2, + base_wait_time: float = 0.5, + max_wait_time: float = 2, + timeout: float = 10.0, + **params, +) -> requests.Response: + """Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff. 
+ + Note that unlike the upstream datasets implementation, this wrapper does not check the HF_DATASETS_OFFLINE environment variable. + + Args: + method (str): HTTP method, such as 'GET' or 'HEAD'. + url (str): The URL of the resource to fetch. + max_retries (int): Maximum number of retries, defaults to 2. + base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between + retries then grows exponentially, capped by max_wait_time. + max_wait_time (float): Maximum amount of time between two retries, in seconds. + **params (additional keyword arguments): Params to pass to :obj:`requests.request`. + """ + tries, success = 0, False + response = None + while not success: + tries += 1 + try: + response = requests.request(method=method.upper(), url=url, timeout=timeout, **params) + success = True + except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as err: + if tries > max_retries: + raise err + else: + logger.info(f'{method} request to {url} timed out, retrying... 
[{tries}/{max_retries}]') + sleep_time = min(max_wait_time, base_wait_time * 2 ** (tries - 1)) # Exponential backoff + time.sleep(sleep_time) + return response + + +def http_head_ms( + url, proxies=None, headers=None, cookies=None, allow_redirects=True, timeout=10.0, max_retries=0 +) -> requests.Response: + headers = copy.deepcopy(headers) or {} + headers['user-agent'] = get_datasets_user_agent_ms(user_agent=headers.get('user-agent')) + response = _request_with_retry_ms( + method='HEAD', + url=url, + proxies=proxies, + headers=headers, + cookies=cookies, + allow_redirects=allow_redirects, + timeout=timeout, + max_retries=max_retries, + ) + return response + + +def http_get_ms( + url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0, desc=None +) -> Optional[requests.Response]: + headers = dict(headers) if headers is not None else {} + headers['user-agent'] = get_datasets_user_agent_ms(user_agent=headers.get('user-agent')) + if resume_size > 0: + headers['Range'] = f'bytes={resume_size:d}-' + response = _request_with_retry_ms( + method='GET', + url=url, + stream=True, + proxies=proxies, + headers=headers, + cookies=cookies, + max_retries=max_retries, + timeout=timeout, + ) + if temp_file is None: + return response + if response.status_code == 416: # Range not satisfiable + return + content_length = response.headers.get('Content-Length') + total = resume_size + int(content_length) if content_length is not None else None + + progress = tqdm(total=total, initial=resume_size, unit_scale=True, unit='B', desc=desc or 'Downloading') + for chunk in response.iter_content(chunk_size=1024): + progress.update(len(chunk)) + temp_file.write(chunk) + + progress.close() + + + def get_from_cache_ms( url, cache_dir=None, @@ -42,7 +153,7 @@ def get_from_cache_ms( ignore_url_params=False, storage_options=None, download_desc=None, - disable_tqdm=False, + disable_tqdm=None, ) -> str: """ Given a URL, look for the corresponding file in the local 
cache. @@ -88,6 +199,8 @@ def get_from_cache_ms( # if we don't ask for 'force_download' then we spare a request filename = hash_url_to_filename(cached_url, etag=None) cache_path = os.path.join(cache_dir, filename) + if download_desc is None: + download_desc = 'Downloading [' + filename + ']' if os.path.exists(cache_path) and not force_download and not use_etag: return cache_path @@ -100,16 +213,14 @@ def get_from_cache_ms( # We don't have the file locally or we need an eTag if not local_files_only: scheme = urlparse(url).scheme - if scheme == 'ftp': - connected = ftp_head(url) - elif scheme not in ('http', 'https'): + if scheme not in ('http', 'https'): response = fsspec_head(url, storage_options=storage_options) # s3fs uses "ETag", gcsfs uses "etag" etag = (response.get('ETag', None) or response.get('etag', None)) if use_etag else None connected = True try: cookies = ModelScopeConfig.get_cookies() - response = http_head( + response = http_head_ms( url, allow_redirects=True, proxies=proxies, @@ -166,7 +277,6 @@ def get_from_cache_ms( ) elif response is not None and response.status_code == 404: raise FileNotFoundError(f"Couldn't find file at {url}") - _raise_if_offline_mode_is_enabled(f'Tried to reach {url}') if head_error is not None: raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})") elif response is not None: @@ -205,48 +315,21 @@ def get_from_cache_ms( # Download to temporary file, then copy to cache path once finished. # Otherwise, you get corrupt cache entries if the download gets interrupted. 
with temp_file_manager() as temp_file: - logger.info(f'Downloading to {temp_file.name}') # GET file object - if scheme == 'ftp': - ftp_get(url, temp_file) - elif scheme not in ('http', 'https'): - fsspec_get_sig = inspect.signature(fsspec_get) - if 'disable_tqdm' in fsspec_get_sig.parameters: - fsspec_get(url, - temp_file, - storage_options=storage_options, - desc=download_desc, - disable_tqdm=disable_tqdm - ) - else: - fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc) + if scheme not in ('http', 'https'): + fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc) else: - http_get_sig = inspect.signature(http_get) - - if 'disable_tqdm' in http_get_sig.parameters: - http_get( - url, - temp_file=temp_file, - proxies=proxies, - resume_size=resume_size, - headers=headers, - cookies=cookies, - max_retries=max_retries, - desc=download_desc, - disable_tqdm=disable_tqdm, - ) - else: - http_get( - url, - temp_file=temp_file, - proxies=proxies, - resume_size=resume_size, - headers=headers, - cookies=cookies, - max_retries=max_retries, - desc=download_desc, - ) + http_get_ms( + url, + temp_file=temp_file, + proxies=proxies, + resume_size=resume_size, + headers=headers, + cookies=cookies, + max_retries=max_retries, + desc=download_desc, + ) logger.info(f'storing {url} in cache at {cache_path}') shutil.move(temp_file.name, cache_path) diff --git a/modelscope/preprocessors/templates/loader.py b/modelscope/preprocessors/templates/loader.py index e3ed9f89..e286802b 100644 --- a/modelscope/preprocessors/templates/loader.py +++ b/modelscope/preprocessors/templates/loader.py @@ -83,7 +83,7 @@ template_info = [ TemplateInfo( template=TemplateType.chatml, template_regex= - f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*{chat_suffix}.*', + f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*', modelfile_link= 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/yi-1.5.modelfile', ), @@ -110,6 +110,10 @@ template_info = 
[ 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/glm4.modelfile', ), + TemplateInfo( + template_regex=f'.*{cases("llava-llama-3")}.*', + modelfile_link='https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/llava-llama-3.modelfile'), + # baichuan TemplateInfo( template=TemplateType.baichuan, diff --git a/modelscope/utils/hf_util.py b/modelscope/utils/hf_util.py index 01aeebef..f5fb8d33 100644 --- a/modelscope/utils/hf_util.py +++ b/modelscope/utils/hf_util.py @@ -127,7 +127,9 @@ def _patch_pretrained_class(): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors'] + ignore_file_pattern = [ + r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt' + ] model_dir = get_model_dir(pretrained_model_name_or_path, ignore_file_pattern, **kwargs) return ori_from_pretrained(cls, model_dir, *model_args, **kwargs) @@ -143,14 +145,18 @@ def _patch_pretrained_class(): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors'] + ignore_file_pattern = [ + r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt' + ] model_dir = get_model_dir(pretrained_model_name_or_path, ignore_file_pattern, **kwargs) return ori_from_pretrained(cls, model_dir, *model_args, **kwargs) @classmethod def get_config_dict(cls, pretrained_model_name_or_path, **kwargs): - ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors'] + ignore_file_pattern = [ + r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt' + ] model_dir = get_model_dir(pretrained_model_name_or_path, ignore_file_pattern, **kwargs) return ori_get_config_dict(cls, model_dir, **kwargs) @@ -242,11 +248,20 @@ AutoModelForTokenClassification = get_wrapped_class( AutoModelForTokenClassificationHF) AutoTokenizer = get_wrapped_class( - AutoTokenizerHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors']) + 
AutoTokenizerHF, + ignore_file_pattern=[ + r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt' + ]) AutoConfig = get_wrapped_class( - AutoConfigHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors']) + AutoConfigHF, + ignore_file_pattern=[ + r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt' + ]) GenerationConfig = get_wrapped_class( - GenerationConfigHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors']) + GenerationConfigHF, + ignore_file_pattern=[ + r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt' + ]) GPTQConfig = GPTQConfigHF AwqConfig = AwqConfigHF BitsAndBytesConfig = BitsAndBytesConfigHF diff --git a/requirements/datasets.txt b/requirements/datasets.txt index 9035b3e6..b290664e 100644 --- a/requirements/datasets.txt +++ b/requirements/datasets.txt @@ -1,6 +1,6 @@ addict attrs -datasets>=2.18.0,<3.0.0 +datasets>=3.0.0 einops oss2 Pillow diff --git a/requirements/framework.txt b/requirements/framework.txt index 23f5b639..dabab41f 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,6 @@ addict attrs -datasets>=2.18.0,<3.0.0 +datasets>=3.0.0 einops oss2 Pillow diff --git a/tests/msdatasets/test_stream_load.py b/tests/msdatasets/test_stream_load.py index 3252d8cf..0ce46887 100644 --- a/tests/msdatasets/test_stream_load.py +++ b/tests/msdatasets/test_stream_load.py @@ -44,6 +44,15 @@ class TestStreamLoad(unittest.TestCase): assert sample['question'], f'Failed to load sample from {repo_id}' + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_stream_swift_jsonl(self): + repo_id: str = 'iic/MSAgent-MultiRole' + ds = MsDataset.load(repo_id, split='train', use_streaming=True) + sample = next(iter(ds)) + logger.info(sample) + + assert sample['id'], f'Failed to load sample from {repo_id}' + if __name__ == '__main__': unittest.main() diff --git a/tests/tools/test_to_ollama.py b/tests/tools/test_to_ollama.py index aaf5f4d0..ba92c1ea 100644 --- a/tests/tools/test_to_ollama.py 
+++ b/tests/tools/test_to_ollama.py @@ -100,6 +100,11 @@ class TestToOllama(unittest.TestCase): ollama = TemplateLoader.to_ollama( 'QuantFactory/Mistral-Nemo-Japanese-Instruct-2408-GGUF') self.assertTrue(ollama is not None) + ollama = TemplateLoader.to_ollama('AI-ModelScope/Yi-1.5-9B-32K-GGUF') + self.assertTrue(ollama is not None) + ollama = TemplateLoader.to_ollama( + 'AI-ModelScope/llava-llama-3-8b-v1_1-gguf') + self.assertTrue(ollama is not None) if __name__ == '__main__':