diff --git a/.dev_scripts/build_image.sh b/.dev_scripts/build_image.sh index 5ed5adf8..f52981b2 100644 --- a/.dev_scripts/build_image.sh +++ b/.dev_scripts/build_image.sh @@ -159,7 +159,7 @@ docker_file_content=`cat docker/Dockerfile.ubuntu` BUILD_HASH_ID=$(git rev-parse HEAD) # install thrid part library -docker_file_content="${docker_file_content} \nRUN export COMMIT_ID=$BUILD_HASH_ID && pip install --no-cache-dir -U adaseq pai-easycv && pip install --no-cache-dir -U 'ms-swift' 'decord' 'qwen_vl_utils' 'pyav' 'librosa' 'funasr' 'timm>0.9.5' 'accelerate' 'gradio' 'peft' 'optimum' 'trl' 'transformers'" +docker_file_content="${docker_file_content} \nRUN export COMMIT_ID=$BUILD_HASH_ID && pip install --no-cache-dir -U adaseq pai-easycv && pip install --no-cache-dir -U 'git+https://github.com/modelscope/ms-swift.git@release/2.5' 'decord' 'qwen_vl_utils' 'pyav' 'librosa' 'funasr' 'timm>0.9.5' 'transformers' 'accelerate' 'gradio' 'peft' 'optimum' 'trl'" docker_file_content="${docker_file_content} \nRUN pip uninstall modelscope -y && export COMMIT_ID=$BUILD_HASH_ID && cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b $build_branch --single-branch $REPO_URL && cd modelscope && pip install . && cd / && rm -fr /tmp/modelscope && pip cache purge;" diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4fdf7351..f5a42ca4 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -3,7 +3,7 @@ name: Bug report about: Create a bug report to help us improve title: '' labels: '' -assignees: Firmament-cyou, tastelikefeet, wangxingjun778, wenmengzhou, zzclynn +assignees: tastelikefeet, wangxingjun778, yingdachen --- @@ -36,14 +36,14 @@ A clear and concise description of what the bug is. 
Please @ corresponding people according to your problem: -Model related: @wenmengzhou @tastelikefeet +Model related: @tastelikefeet -Model hub related: @liuyhwangyh +Model hub related: @liuyhwangyh @tastelikefeet @wangxingjun778 Dataset releated: @wangxingjun778 Finetune related: @tastelikefeet @Jintao-Huang -Pipeline related: @Firmament-cyou @wenmengzhou +Pipeline related: @tastelikefeet @wangxingjun778 -Contribute your model: @zzclynn +Contribute your model: @yingdachen diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 0731f3c1..6eef2aa5 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -3,7 +3,7 @@ name: Feature request about: Suggest an idea for this project title: '' labels: '' -assignees: tastelikefeet, wangxingjun778, wenmengzhou, yingdachen, zzclynn +assignees: yingdachen, wangxingjun778, tastelikefeet --- diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index c7ec7256..3545e543 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -3,7 +3,7 @@ name: Question about: Describe this issue template's purpose here. 
title: '' labels: '' -assignees: zzclynn,wenmengzhou +assignees: tastelikefeet, wangxingjun778, yingdachen --- @@ -18,7 +18,7 @@ Before asking a question, make sure you have: Please @ corresponding people according to your problem: -Model related: @wenmengzhou @tastelikefeet +Model related: @tastelikefeet Model hub related: @liuyhwangyh @@ -26,6 +26,6 @@ Dataset releated: @wangxingjun778 Finetune related: @tastelikefeet @Jintao-Huang -Pipeline related: @Firmament-cyou @wenmengzhou +Pipeline related: @tastelikefeet @wangxingjun778 -Contribute your model: @zzclynn +Contribute your model: @yingdachen diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7e6e9b77..a8565f16 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,5 @@ +exclude: 'modelscope/preprocessors/templates/' + repos: - repo: https://github.com/pycqa/flake8.git rev: 4.0.0 diff --git a/.pre-commit-config_local.yaml b/.pre-commit-config_local.yaml index a68a5b78..869d8fd6 100644 --- a/.pre-commit-config_local.yaml +++ b/.pre-commit-config_local.yaml @@ -1,3 +1,5 @@ +exclude: 'modelscope/preprocessors/templates/' + repos: - repo: /home/admin/pre-commit/flake8 rev: 4.0.0 diff --git a/modelscope/cli/clearcache.py b/modelscope/cli/clearcache.py new file mode 100644 index 00000000..7b89103b --- /dev/null +++ b/modelscope/cli/clearcache.py @@ -0,0 +1,107 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +from argparse import ArgumentParser +from pathlib import Path + +from modelscope.cli.base import CLICommand +from modelscope.hub.constants import TEMPORARY_FOLDER_NAME + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. 
+ """ + return ClearCacheCMD(args) + + +class ClearCacheCMD(CLICommand): + name = 'clear-cache' + + def __init__(self, args): + self.args = args + self.cache_dir = os.getenv( + 'MODELSCOPE_CACHE', + Path.home().joinpath('.cache', 'modelscope')) + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for clear-cache command. + """ + parser = parsers.add_parser(ClearCacheCMD.name) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--model', + type=str, + help= + 'The id of the model whose cache will be cleared. For clear-cache, ' + 'if neither model or dataset id is provided, entire cache will be cleared.' + ) + group.add_argument( + '--dataset', + type=str, + help= + 'The id of the dataset whose cache will be cleared. For clear-cache, ' + 'if neither model or dataset id is provided, entire cache will be cleared.' + ) + + parser.set_defaults(func=subparser_func) + + def execute(self): + self._execute_with_confirmation() + + def _execute_with_confirmation(self): + all = False + single_model = False + prompt = '\nYou are about to delete ' + + if self.args.model or self.args.dataset: + if self.args.model: + id = self.args.model + single_model = True + prompt = prompt + f'local cache for model {id}. ' + else: + id = self.args.dataset + prompt = prompt + f'local cache for dataset {id}. 
' + else: + prompt = prompt + f'entire ModelScope cache at {self.cache_dir}, including ALL models and dataset.\n' + all = True + user_input = input( + prompt + + '\nPlease press Y or y to proceed, any other key to abort.\n' + ).strip().upper() + + if user_input == 'Y': + if all: + self._remove_directory(self.cache_dir) + print('Cache cleared.') + else: + entity_directory = os.path.join( + self.cache_dir, 'hub' if single_model else 'datasets', id) + temp_directory = os.path.join( + self.cache_dir, 'hub' if single_model else 'datasets', + TEMPORARY_FOLDER_NAME, id) + entity_removed = self._remove_directory(entity_directory) + temp_removed = self._remove_directory(temp_directory) + if (not entity_removed) and (not temp_removed): + if single_model: + print( + f'Cache for Model {id} not found. Nothing to do.') + else: + print( + f'Cache for Dataset {id} not found. Nothing to do.' + ) + else: + print('Cache cleared.') + else: + print('Operation aborted.') + return + + def _remove_directory(self, path): + if os.path.exists(path): + try: + shutil.rmtree(path) + print(f'Cache folder {path} removed.') + return True + except Exception as e: + print(f'An error occurred while clearing cache at {path}: {e}') + return False diff --git a/modelscope/cli/cli.py b/modelscope/cli/cli.py index 5e3fcbfd..74fb05db 100644 --- a/modelscope/cli/cli.py +++ b/modelscope/cli/cli.py @@ -3,6 +3,7 @@ import argparse import logging +from modelscope.cli.clearcache import ClearCacheCMD from modelscope.cli.download import DownloadCMD from modelscope.cli.login import LoginCMD from modelscope.cli.modelcard import ModelCardCMD @@ -23,6 +24,7 @@ def run_cmd(): subparsers = parser.add_subparsers(help='modelscope commands helpers') DownloadCMD.define_args(subparsers) + ClearCacheCMD.define_args(subparsers) PluginsCMD.define_args(subparsers) PipelineCMD.define_args(subparsers) ModelCardCMD.define_args(subparsers) diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py 
b/modelscope/msdatasets/utils/hf_datasets_util.py index 36204d93..5b3a8bb7 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -555,7 +555,7 @@ def get_module_without_script(self) -> DatasetModule: download_config = self.download_config.copy() if download_config.download_desc is None: - download_config.download_desc = 'Downloading readme' + download_config.download_desc = 'Downloading [README.md]' try: url_or_filename = _ms_api.get_dataset_file_url( file_name='README.md', @@ -989,7 +989,6 @@ class DatasetsWrapperHF: download_config=download_config, download_mode=download_mode, verification_mode=verification_mode, - try_from_hf_gcs=False, num_proc=num_proc, storage_options=storage_options, # base_path=builder_instance.base_path, diff --git a/modelscope/msdatasets/utils/hf_file_utils.py b/modelscope/msdatasets/utils/hf_file_utils.py index b2931f7e..863bb196 100644 --- a/modelscope/msdatasets/utils/hf_file_utils.py +++ b/modelscope/msdatasets/utils/hf_file_utils.py @@ -5,27 +5,138 @@ import json import os import re +import copy import shutil +import time import warnings -import inspect from contextlib import contextmanager from functools import partial from pathlib import Path +from typing import Optional, Union from urllib.parse import urljoin, urlparse import requests +from tqdm import tqdm from datasets import config -from datasets.utils.file_utils import hash_url_to_filename, get_authentication_headers_for_url, ftp_head, fsspec_head, \ - http_head, _raise_if_offline_mode_is_enabled, ftp_get, fsspec_get, http_get +from datasets.utils.file_utils import hash_url_to_filename, \ + get_authentication_headers_for_url, fsspec_head, fsspec_get from filelock import FileLock from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.logger import get_logger from modelscope.hub.api import ModelScopeConfig +from modelscope import __version__ + logger = get_logger() +def 
get_datasets_user_agent_ms(user_agent: Optional[Union[str, dict]] = None) -> str: + ua = f'datasets/{__version__}' + ua += f'; python/{config.PY_VERSION}' + ua += f'; pyarrow/{config.PYARROW_VERSION}' + if config.TORCH_AVAILABLE: + ua += f'; torch/{config.TORCH_VERSION}' + if config.TF_AVAILABLE: + ua += f'; tensorflow/{config.TF_VERSION}' + if config.JAX_AVAILABLE: + ua += f'; jax/{config.JAX_VERSION}' + if isinstance(user_agent, dict): + ua += f"; {'; '.join(f'{k}/{v}' for k, v in user_agent.items())}" + elif isinstance(user_agent, str): + ua += '; ' + user_agent + return ua + + +def _request_with_retry_ms( + method: str, + url: str, + max_retries: int = 2, + base_wait_time: float = 0.5, + max_wait_time: float = 2, + timeout: float = 10.0, + **params, +) -> requests.Response: + """Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff. + + Note that if the environment variable HF_DATASETS_OFFLINE is set to 1, then a OfflineModeIsEnabled error is raised. + + Args: + method (str): HTTP method, such as 'GET' or 'HEAD'. + url (str): The URL of the resource to fetch. + max_retries (int): Maximum number of retries, defaults to 0 (no retries). + base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between + retries then grows exponentially, capped by max_wait_time. + max_wait_time (float): Maximum amount of time between two retries, in seconds. + **params (additional keyword arguments): Params to pass to :obj:`requests.request`. + """ + tries, success = 0, False + response = None + while not success: + tries += 1 + try: + response = requests.request(method=method.upper(), url=url, timeout=timeout, **params) + success = True + except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as err: + if tries > max_retries: + raise err + else: + logger.info(f'{method} request to {url} timed out, retrying... 
[{tries}/{max_retries}]') + sleep_time = min(max_wait_time, base_wait_time * 2 ** (tries - 1)) # Exponential backoff + time.sleep(sleep_time) + return response + + +def http_head_ms( + url, proxies=None, headers=None, cookies=None, allow_redirects=True, timeout=10.0, max_retries=0 +) -> requests.Response: + headers = copy.deepcopy(headers) or {} + headers['user-agent'] = get_datasets_user_agent_ms(user_agent=headers.get('user-agent')) + response = _request_with_retry_ms( + method='HEAD', + url=url, + proxies=proxies, + headers=headers, + cookies=cookies, + allow_redirects=allow_redirects, + timeout=timeout, + max_retries=max_retries, + ) + return response + + +def http_get_ms( + url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0, desc=None +) -> Optional[requests.Response]: + headers = dict(headers) if headers is not None else {} + headers['user-agent'] = get_datasets_user_agent_ms(user_agent=headers.get('user-agent')) + if resume_size > 0: + headers['Range'] = f'bytes={resume_size:d}-' + response = _request_with_retry_ms( + method='GET', + url=url, + stream=True, + proxies=proxies, + headers=headers, + cookies=cookies, + max_retries=max_retries, + timeout=timeout, + ) + if temp_file is None: + return response + if response.status_code == 416: # Range not satisfiable + return + content_length = response.headers.get('Content-Length') + total = resume_size + int(content_length) if content_length is not None else None + + progress = tqdm(total=total, initial=resume_size, unit_scale=True, unit='B', desc=desc or 'Downloading') + for chunk in response.iter_content(chunk_size=1024): + progress.update(len(chunk)) + temp_file.write(chunk) + + progress.close() + + + def get_from_cache_ms( url, cache_dir=None, @@ -42,7 +153,7 @@ def get_from_cache_ms( ignore_url_params=False, storage_options=None, download_desc=None, - disable_tqdm=False, + disable_tqdm=None, ) -> str: """ Given a URL, look for the corresponding file in the local
cache. @@ -88,6 +199,8 @@ def get_from_cache_ms( # if we don't ask for 'force_download' then we spare a request filename = hash_url_to_filename(cached_url, etag=None) cache_path = os.path.join(cache_dir, filename) + if download_desc is None: + download_desc = 'Downloading [' + filename + ']' if os.path.exists(cache_path) and not force_download and not use_etag: return cache_path @@ -100,16 +213,14 @@ def get_from_cache_ms( # We don't have the file locally or we need an eTag if not local_files_only: scheme = urlparse(url).scheme - if scheme == 'ftp': - connected = ftp_head(url) - elif scheme not in ('http', 'https'): + if scheme not in ('http', 'https'): response = fsspec_head(url, storage_options=storage_options) # s3fs uses "ETag", gcsfs uses "etag" etag = (response.get('ETag', None) or response.get('etag', None)) if use_etag else None connected = True try: cookies = ModelScopeConfig.get_cookies() - response = http_head( + response = http_head_ms( url, allow_redirects=True, proxies=proxies, @@ -166,7 +277,6 @@ def get_from_cache_ms( ) elif response is not None and response.status_code == 404: raise FileNotFoundError(f"Couldn't find file at {url}") - _raise_if_offline_mode_is_enabled(f'Tried to reach {url}') if head_error is not None: raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})") elif response is not None: @@ -205,48 +315,21 @@ def get_from_cache_ms( # Download to temporary file, then copy to cache path once finished. # Otherwise, you get corrupt cache entries if the download gets interrupted. 
with temp_file_manager() as temp_file: - logger.info(f'Downloading to {temp_file.name}') # GET file object - if scheme == 'ftp': - ftp_get(url, temp_file) - elif scheme not in ('http', 'https'): - fsspec_get_sig = inspect.signature(fsspec_get) - if 'disable_tqdm' in fsspec_get_sig.parameters: - fsspec_get(url, - temp_file, - storage_options=storage_options, - desc=download_desc, - disable_tqdm=disable_tqdm - ) - else: - fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc) + if scheme not in ('http', 'https'): + fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc) else: - http_get_sig = inspect.signature(http_get) - - if 'disable_tqdm' in http_get_sig.parameters: - http_get( - url, - temp_file=temp_file, - proxies=proxies, - resume_size=resume_size, - headers=headers, - cookies=cookies, - max_retries=max_retries, - desc=download_desc, - disable_tqdm=disable_tqdm, - ) - else: - http_get( - url, - temp_file=temp_file, - proxies=proxies, - resume_size=resume_size, - headers=headers, - cookies=cookies, - max_retries=max_retries, - desc=download_desc, - ) + http_get_ms( + url, + temp_file=temp_file, + proxies=proxies, + resume_size=resume_size, + headers=headers, + cookies=cookies, + max_retries=max_retries, + desc=download_desc, + ) logger.info(f'storing {url} in cache at {cache_path}') shutil.move(temp_file.name, cache_path) diff --git a/modelscope/preprocessors/templates/__init__.py b/modelscope/preprocessors/templates/__init__.py new file mode 100644 index 00000000..5ac1780d --- /dev/null +++ b/modelscope/preprocessors/templates/__init__.py @@ -0,0 +1,2 @@ +from .base import Template, get_template +from .template import TemplateType diff --git a/modelscope/preprocessors/templates/base.py b/modelscope/preprocessors/templates/base.py new file mode 100644 index 00000000..4504a4bc --- /dev/null +++ b/modelscope/preprocessors/templates/base.py @@ -0,0 +1,1041 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import json +import re +from copy import deepcopy +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from modelscope import get_logger +from torch.nn import Module +from torch.nn.utils.rnn import pad_sequence +from transformers import PreTrainedTokenizerBase, StoppingCriteria +from .loss_scale import loss_scale_map +from .tools_prompt import get_tools_prompt +from .utils import load_batch, load_image, rescale_image, fetch_one, to_device, decode_base64 +from .utils import History, Prompt, StopWords, Context, Messages + +logger = get_logger() + +DEFAULT_SYSTEM = 'You are a helpful assistant.' + +TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} + + +def get_template( + template_type: str, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs, +) -> 'Template': + template_info = TEMPLATE_MAPPING[template_type] + template = deepcopy(template_info['template']) + template.init_template(tokenizer, default_system, max_length, truncation_strategy, **kwargs) + return template + + +def _findall(token_list: List[int], sub_token_list: Union[int, List[int]]) -> List[int]: + """Find the index of a token in the token_list.""" + if isinstance(sub_token_list, int): + sub_token_list = [sub_token_list] + res = [] + idx = -1 + try: + while True: + idx = token_list.index(sub_token_list[0], idx + 1) + if len(sub_token_list) == 1 or sub_token_list == token_list[idx:idx + len(sub_token_list)]: + res.append(idx) + except ValueError: + pass + return res + + +def replace_img_tag(messages: Messages, + replace_token: str, + pattern=r'(.+?)') -> Tuple[str, History, List[str]]: + images_path = [] + new_messages = [] + for i, m in enumerate(messages): + m = m.copy() + if m['content'] is None or m['role'] in ('tool', 'system', 'assistant'): + new_messages.append(m) + else: + images_path += 
re.findall(pattern, m['content']) + m['content'] = re.sub(pattern, replace_token, m['content']) + new_messages.append(m) + return new_messages, images_path + + +class StopWordsCriteria(StoppingCriteria): + """Adding extra stop words in template to prevent unstoppable generation + Like suffixes and chat seps in the template. + """ + def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_words: StopWords, **tokenizer_kwargs) -> None: + self.tokenizer = tokenizer + self.stop_words = stop_words + self.tokenizer_kwargs = tokenizer_kwargs + self.start_idx = -1 + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor, **kwargs) -> bool: + if self.start_idx == -1: + self.start_idx = len(input_ids[0]) - 1 + tokenizer = self.tokenizer + stop_words = self.stop_words + # [-20:]: Assuming the end tokens do not exceed 20 tokens, + # to avoid input_ids being too long and affecting efficiency. + text = tokenizer.decode(input_ids[0, self.start_idx:][-20:], **self.tokenizer_kwargs) + for stop_word in stop_words: + if isinstance(stop_word, str): + if stop_word in text: + return True + else: # list + if len(stop_word) > 0 and input_ids[0].tolist()[-len(stop_word):] == stop_word: + return True + return False + + +class Template: + """A template class for all supported models. + + Args: + prefix: Prefix tokens before the first turn's prompt + prompt: A list of elements whose types are str and list of integers. The input query part of every turn. + chat_sep: The chat separators between every turn. + suffix: The end tokens after the chat finished. + default_system: A default system instruction. + system_prefix: The prefix if the `system` is not empty. + auto_add_bos: By default, the bos_token is not added. The auto_add_bos option will determine + whether to add it based on `tokenizer.encode('')`.
+ tools_prompt: The tools prompt name + tool_prompt: The tool prompt, usually useful when there is a tool role + padding_side: The padding side + infer_media_type: The media type supported by the multi-modals + Examples: + system\nYou are a helpful assistant!\nWho are you?\nassistant:I am a robot\nWho are you?\nassistant:I am a robot # noqa + ----------system------------ ---query---- --response- -----chatsep----- ---query--- --response- ----suffix----- + ----------------------------system_prefix---------------------------- ---------------------------- prompt ------------------------------------- ---------------------------- prompt ------------------------------------- + + """ + + special_tokens = ['', '