Merge commit 'bd1544bef0945677bfd6c0ac2b24f353f2f0817d' into feat/template

* commit 'bd1544bef0945677bfd6c0ac2b24f353f2f0817d':
  fix template: llava-llama-3 & yi-1.5 (#1011)
  add cmd line option of clear-cache (#1009)
  do not download pt and pth files for autoconfig, autotokenizer and generation config (#1008)
  Update issue templates
  Adapt new datasets (#1002)
  template and ollama in modelscope (#995)
  Unify dataset download log and remove tqdm disable option (#997)

# Conflicts:
#	modelscope/preprocessors/templates/loader.py
#	tests/tools/test_to_ollama.py
This commit is contained in:
yuze.zyz
2024-10-10 17:49:13 +08:00
13 changed files with 293 additions and 69 deletions

View File

@@ -3,7 +3,7 @@ name: Bug report
about: Create a bug report to help us improve
title: ''
labels: ''
assignees: Firmament-cyou, tastelikefeet, wangxingjun778, wenmengzhou, zzclynn
assignees: tastelikefeet, wangxingjun778, yingdachen
---
@@ -36,14 +36,14 @@ A clear and concise description of what the bug is.
Please @ corresponding people according to your problem:
Model related: @wenmengzhou @tastelikefeet
Model related: @tastelikefeet
Model hub related: @liuyhwangyh
Model hub related: @liuyhwangyh @tastelikefeet @wangxingjun778
Dataset releated: @wangxingjun778
Finetune related: @tastelikefeet @Jintao-Huang
Pipeline related: @Firmament-cyou @wenmengzhou
Pipeline related: @tastelikefeet @wangxingjun778
Contribute your model: @zzclynn
Contribute your model: @yingdachen

View File

@@ -3,7 +3,7 @@ name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: tastelikefeet, wangxingjun778, wenmengzhou, yingdachen, zzclynn
assignees: yingdachen, wangxingjun778, tastelikefeet
---

View File

@@ -3,7 +3,7 @@ name: Question
about: Describe this issue template's purpose here.
title: ''
labels: ''
assignees: zzclynn,wenmengzhou
assignees: tastelikefeet, wangxingjun778, yingdachen
---
@@ -18,7 +18,7 @@ Before asking a question, make sure you have:
Please @ corresponding people according to your problem:
Model related: @wenmengzhou @tastelikefeet
Model related: @tastelikefeet
Model hub related: @liuyhwangyh
@@ -26,6 +26,6 @@ Dataset releated: @wangxingjun778
Finetune related: @tastelikefeet @Jintao-Huang
Pipeline related: @Firmament-cyou @wenmengzhou
Pipeline related: @tastelikefeet @wangxingjun778
Contribute your model: @zzclynn
Contribute your model: @yingdachen

View File

@@ -0,0 +1,107 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
from argparse import ArgumentParser
from pathlib import Path
from modelscope.cli.base import CLICommand
from modelscope.hub.constants import TEMPORARY_FOLDER_NAME
def subparser_func(args):
    """Build the command object for the ``clear-cache`` sub parser."""
    cmd = ClearCacheCMD(args)
    return cmd
class ClearCacheCMD(CLICommand):
    """CLI command that clears the local ModelScope cache.

    Clears the cache of a single model (``--model``), a single dataset
    (``--dataset``), or the entire cache directory when neither is given,
    after an interactive confirmation prompt.
    """

    name = 'clear-cache'

    def __init__(self, args):
        self.args = args
        # Cache root defaults to ~/.cache/modelscope unless MODELSCOPE_CACHE
        # is set in the environment.
        self.cache_dir = os.getenv(
            'MODELSCOPE_CACHE',
            Path.home().joinpath('.cache', 'modelscope'))

    @staticmethod
    def define_args(parsers: ArgumentParser):
        """Define args for the clear-cache command.

        ``--model`` and ``--dataset`` are mutually exclusive; omitting both
        selects the whole cache.
        """
        parser = parsers.add_parser(ClearCacheCMD.name)
        group = parser.add_mutually_exclusive_group()
        group.add_argument(
            '--model',
            type=str,
            help=
            'The id of the model whose cache will be cleared. For clear-cache, '
            'if neither model or dataset id is provided, entire cache will be cleared.'
        )
        group.add_argument(
            '--dataset',
            type=str,
            help=
            'The id of the dataset whose cache will be cleared. For clear-cache, '
            'if neither model or dataset id is provided, entire cache will be cleared.'
        )
        parser.set_defaults(func=subparser_func)

    def execute(self):
        self._execute_with_confirmation()

    def _execute_with_confirmation(self):
        """Prompt the user for confirmation, then delete the selected cache.

        Uses early returns instead of nested branches; local names avoid
        shadowing the ``all`` and ``id`` builtins.
        """
        clear_all = False
        single_model = False
        entity_id = None
        prompt = '\nYou are about to delete '
        if self.args.model or self.args.dataset:
            if self.args.model:
                entity_id = self.args.model
                single_model = True
                prompt = prompt + f'local cache for model {entity_id}. '
            else:
                entity_id = self.args.dataset
                prompt = prompt + f'local cache for dataset {entity_id}. '
        else:
            prompt = prompt + f'entire ModelScope cache at {self.cache_dir}, including ALL models and dataset.\n'
            clear_all = True
        user_input = input(
            prompt
            + '\nPlease press Y or y to proceed, any other key to abort.\n'
        ).strip().upper()
        if user_input != 'Y':
            print('Operation aborted.')
            return
        if clear_all:
            self._remove_directory(self.cache_dir)
            print('Cache cleared.')
            return
        # Models live under 'hub', datasets under 'datasets'; both may also
        # leave partial downloads under the temporary folder.
        sub_dir = 'hub' if single_model else 'datasets'
        entity_directory = os.path.join(self.cache_dir, sub_dir, entity_id)
        temp_directory = os.path.join(self.cache_dir, sub_dir,
                                      TEMPORARY_FOLDER_NAME, entity_id)
        entity_removed = self._remove_directory(entity_directory)
        temp_removed = self._remove_directory(temp_directory)
        if not entity_removed and not temp_removed:
            entity_type = 'Model' if single_model else 'Dataset'
            print(
                f'Cache for {entity_type} {entity_id} not found. Nothing to do.'
            )
        else:
            print('Cache cleared.')

    def _remove_directory(self, path):
        """Remove ``path`` recursively.

        Returns:
            bool: True if the directory was removed, False if it did not
            exist or removal failed. (The original implicitly returned None
            for a missing path; False is now returned explicitly.)
        """
        if not os.path.exists(path):
            return False
        try:
            shutil.rmtree(path)
            print(f'Cache folder {path} removed.')
            return True
        except Exception as e:  # best-effort cleanup: report and continue
            print(f'An error occurred while clearing cache at {path}: {e}')
            return False

View File

@@ -3,6 +3,7 @@
import argparse
import logging
from modelscope.cli.clearcache import ClearCacheCMD
from modelscope.cli.download import DownloadCMD
from modelscope.cli.login import LoginCMD
from modelscope.cli.modelcard import ModelCardCMD
@@ -23,6 +24,7 @@ def run_cmd():
subparsers = parser.add_subparsers(help='modelscope commands helpers')
DownloadCMD.define_args(subparsers)
ClearCacheCMD.define_args(subparsers)
PluginsCMD.define_args(subparsers)
PipelineCMD.define_args(subparsers)
ModelCardCMD.define_args(subparsers)

View File

@@ -555,7 +555,7 @@ def get_module_without_script(self) -> DatasetModule:
download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = 'Downloading readme'
download_config.download_desc = 'Downloading [README.md]'
try:
url_or_filename = _ms_api.get_dataset_file_url(
file_name='README.md',
@@ -989,7 +989,6 @@ class DatasetsWrapperHF:
download_config=download_config,
download_mode=download_mode,
verification_mode=verification_mode,
try_from_hf_gcs=False,
num_proc=num_proc,
storage_options=storage_options,
# base_path=builder_instance.base_path,

View File

@@ -5,27 +5,138 @@
import json
import os
import re
import copy
import shutil
import time
import warnings
import inspect
from contextlib import contextmanager
from functools import partial
from pathlib import Path
from typing import Optional, Union
from urllib.parse import urljoin, urlparse
import requests
from tqdm import tqdm
from datasets import config
from datasets.utils.file_utils import hash_url_to_filename, get_authentication_headers_for_url, ftp_head, fsspec_head, \
http_head, _raise_if_offline_mode_is_enabled, ftp_get, fsspec_get, http_get
from datasets.utils.file_utils import hash_url_to_filename, \
get_authentication_headers_for_url, fsspec_head, fsspec_get
from filelock import FileLock
from modelscope.utils.config_ds import MS_DATASETS_CACHE
from modelscope.utils.logger import get_logger
from modelscope.hub.api import ModelScopeConfig
from modelscope import __version__
logger = get_logger()
def get_datasets_user_agent_ms(user_agent: Optional[Union[str, dict]] = None) -> str:
    """Build the user-agent string for dataset download requests.

    The base string reports the datasets/python/pyarrow versions, appends
    torch/tensorflow/jax versions when available, and finally folds in the
    caller-supplied ``user_agent`` (a str, or a dict joined as ``k/v`` pairs).
    """
    parts = [
        f'datasets/{__version__}',
        f'python/{config.PY_VERSION}',
        f'pyarrow/{config.PYARROW_VERSION}',
    ]
    if config.TORCH_AVAILABLE:
        parts.append(f'torch/{config.TORCH_VERSION}')
    if config.TF_AVAILABLE:
        parts.append(f'tensorflow/{config.TF_VERSION}')
    if config.JAX_AVAILABLE:
        parts.append(f'jax/{config.JAX_VERSION}')
    if isinstance(user_agent, dict):
        parts.append('; '.join(f'{k}/{v}' for k, v in user_agent.items()))
    elif isinstance(user_agent, str):
        parts.append(user_agent)
    return '; '.join(parts)
def _request_with_retry_ms(
    method: str,
    url: str,
    max_retries: int = 2,
    base_wait_time: float = 0.5,
    max_wait_time: float = 2,
    timeout: float = 10.0,
    **params,
) -> requests.Response:
    """Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff.

    Args:
        method (str): HTTP method, such as 'GET' or 'HEAD'.
        url (str): The URL of the resource to fetch.
        max_retries (int): Maximum number of retries, defaults to 2.
        base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between
            retries then grows exponentially, capped by max_wait_time.
        max_wait_time (float): Maximum amount of time between two retries, in seconds.
        timeout (float): Per-request timeout in seconds.
        **params (additional keyword arguments): Params to pass to :obj:`requests.request`.

    Raises:
        requests.exceptions.ConnectTimeout / ConnectionError: re-raised once
        the retry budget is exhausted.
    """
    tries, success = 0, False
    response = None
    while not success:
        tries += 1
        try:
            response = requests.request(method=method.upper(), url=url, timeout=timeout, **params)
            success = True
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as err:
            if tries > max_retries:
                raise err
            # Fixed: was f'[{tries/max_retries}]', which logged a float
            # ratio (e.g. "[0.5]") instead of retry progress like "[1/2]".
            logger.info(f'{method} request to {url} timed out, retrying... [{tries}/{max_retries}]')
            sleep_time = min(max_wait_time, base_wait_time * 2**(tries - 1))  # Exponential backoff
            time.sleep(sleep_time)
    return response
def http_head_ms(
    url, proxies=None, headers=None, cookies=None, allow_redirects=True, timeout=10.0, max_retries=0
) -> requests.Response:
    """Issue a HEAD request for ``url`` with the datasets user-agent and retry support."""
    # Copy caller headers so the injected user-agent never mutates them.
    request_headers = copy.deepcopy(headers) or {}
    request_headers['user-agent'] = get_datasets_user_agent_ms(
        user_agent=request_headers.get('user-agent'))
    return _request_with_retry_ms(
        method='HEAD',
        url=url,
        proxies=proxies,
        headers=request_headers,
        cookies=cookies,
        allow_redirects=allow_redirects,
        timeout=timeout,
        max_retries=max_retries,
    )
def http_get_ms(
    url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0, desc=None
) -> Optional[requests.Response]:
    """Stream ``url`` into ``temp_file`` with a tqdm progress bar.

    When ``temp_file`` is None the raw streaming response is returned
    instead of being consumed. When ``resume_size`` > 0 a Range header is
    sent so a partial download can be continued; a 416 answer means the
    range is not satisfiable and nothing is written.
    """
    request_headers = {} if headers is None else dict(headers)
    request_headers['user-agent'] = get_datasets_user_agent_ms(
        user_agent=request_headers.get('user-agent'))
    if resume_size > 0:
        request_headers['Range'] = f'bytes={resume_size:d}-'
    response = _request_with_retry_ms(
        method='GET',
        url=url,
        stream=True,
        proxies=proxies,
        headers=request_headers,
        cookies=cookies,
        max_retries=max_retries,
        timeout=timeout,
    )
    if temp_file is None:
        return response
    if response.status_code == 416:  # Range not satisfiable
        return
    content_length = response.headers.get('Content-Length')
    total = None if content_length is None else resume_size + int(content_length)
    progress = tqdm(total=total, initial=resume_size, unit_scale=True, unit='B', desc=desc or 'Downloading')
    for chunk in response.iter_content(chunk_size=1024):
        progress.update(len(chunk))
        temp_file.write(chunk)
    progress.close()
def get_from_cache_ms(
url,
cache_dir=None,
@@ -42,7 +153,7 @@ def get_from_cache_ms(
ignore_url_params=False,
storage_options=None,
download_desc=None,
disable_tqdm=False,
disable_tqdm=None,
) -> str:
"""
Given a URL, look for the corresponding file in the local cache.
@@ -88,6 +199,8 @@ def get_from_cache_ms(
# if we don't ask for 'force_download' then we spare a request
filename = hash_url_to_filename(cached_url, etag=None)
cache_path = os.path.join(cache_dir, filename)
if download_desc is None:
download_desc = 'Downloading [' + filename + ']'
if os.path.exists(cache_path) and not force_download and not use_etag:
return cache_path
@@ -100,16 +213,14 @@ def get_from_cache_ms(
# We don't have the file locally or we need an eTag
if not local_files_only:
scheme = urlparse(url).scheme
if scheme == 'ftp':
connected = ftp_head(url)
elif scheme not in ('http', 'https'):
if scheme not in ('http', 'https'):
response = fsspec_head(url, storage_options=storage_options)
# s3fs uses "ETag", gcsfs uses "etag"
etag = (response.get('ETag', None) or response.get('etag', None)) if use_etag else None
connected = True
try:
cookies = ModelScopeConfig.get_cookies()
response = http_head(
response = http_head_ms(
url,
allow_redirects=True,
proxies=proxies,
@@ -166,7 +277,6 @@ def get_from_cache_ms(
)
elif response is not None and response.status_code == 404:
raise FileNotFoundError(f"Couldn't find file at {url}")
_raise_if_offline_mode_is_enabled(f'Tried to reach {url}')
if head_error is not None:
raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})")
elif response is not None:
@@ -205,48 +315,21 @@ def get_from_cache_ms(
# Download to temporary file, then copy to cache path once finished.
# Otherwise, you get corrupt cache entries if the download gets interrupted.
with temp_file_manager() as temp_file:
logger.info(f'Downloading to {temp_file.name}')
# GET file object
if scheme == 'ftp':
ftp_get(url, temp_file)
elif scheme not in ('http', 'https'):
fsspec_get_sig = inspect.signature(fsspec_get)
if 'disable_tqdm' in fsspec_get_sig.parameters:
fsspec_get(url,
temp_file,
storage_options=storage_options,
desc=download_desc,
disable_tqdm=disable_tqdm
)
else:
fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc)
if scheme not in ('http', 'https'):
fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc)
else:
http_get_sig = inspect.signature(http_get)
if 'disable_tqdm' in http_get_sig.parameters:
http_get(
url,
temp_file=temp_file,
proxies=proxies,
resume_size=resume_size,
headers=headers,
cookies=cookies,
max_retries=max_retries,
desc=download_desc,
disable_tqdm=disable_tqdm,
)
else:
http_get(
url,
temp_file=temp_file,
proxies=proxies,
resume_size=resume_size,
headers=headers,
cookies=cookies,
max_retries=max_retries,
desc=download_desc,
)
http_get_ms(
url,
temp_file=temp_file,
proxies=proxies,
resume_size=resume_size,
headers=headers,
cookies=cookies,
max_retries=max_retries,
desc=download_desc,
)
logger.info(f'storing {url} in cache at {cache_path}')
shutil.move(temp_file.name, cache_path)

View File

@@ -83,7 +83,7 @@ template_info = [
TemplateInfo(
template=TemplateType.chatml,
template_regex=
f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*{chat_suffix}.*',
f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*',
modelfile_link=
'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/yi-1.5.modelfile',
),
@@ -110,6 +110,10 @@ template_info = [
'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/glm4.modelfile',
),
TemplateInfo(
template_regex=f'.*{cases("llava-llama-3")}.*',
modelfile_link='https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/llava-llama-3.modelfile'),
# baichuan
TemplateInfo(
template=TemplateType.baichuan,

View File

@@ -127,7 +127,9 @@ def _patch_pretrained_class():
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
**kwargs):
ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
ignore_file_pattern = [
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
]
model_dir = get_model_dir(pretrained_model_name_or_path,
ignore_file_pattern, **kwargs)
return ori_from_pretrained(cls, model_dir, *model_args, **kwargs)
@@ -143,14 +145,18 @@ def _patch_pretrained_class():
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
**kwargs):
ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
ignore_file_pattern = [
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
]
model_dir = get_model_dir(pretrained_model_name_or_path,
ignore_file_pattern, **kwargs)
return ori_from_pretrained(cls, model_dir, *model_args, **kwargs)
@classmethod
def get_config_dict(cls, pretrained_model_name_or_path, **kwargs):
ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
ignore_file_pattern = [
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
]
model_dir = get_model_dir(pretrained_model_name_or_path,
ignore_file_pattern, **kwargs)
return ori_get_config_dict(cls, model_dir, **kwargs)
@@ -242,11 +248,20 @@ AutoModelForTokenClassification = get_wrapped_class(
AutoModelForTokenClassificationHF)
AutoTokenizer = get_wrapped_class(
AutoTokenizerHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors'])
AutoTokenizerHF,
ignore_file_pattern=[
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
])
AutoConfig = get_wrapped_class(
AutoConfigHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors'])
AutoConfigHF,
ignore_file_pattern=[
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
])
GenerationConfig = get_wrapped_class(
GenerationConfigHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors'])
GenerationConfigHF,
ignore_file_pattern=[
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
])
GPTQConfig = GPTQConfigHF
AwqConfig = AwqConfigHF
BitsAndBytesConfig = BitsAndBytesConfigHF

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=2.18.0,<3.0.0
datasets>=3.0.0
einops
oss2
Pillow

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=2.18.0,<3.0.0
datasets>=3.0.0
einops
oss2
Pillow

View File

@@ -44,6 +44,15 @@ class TestStreamLoad(unittest.TestCase):
assert sample['question'], f'Failed to load sample from {repo_id}'
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_stream_swift_jsonl(self):
repo_id: str = 'iic/MSAgent-MultiRole'
ds = MsDataset.load(repo_id, split='train', use_streaming=True)
sample = next(iter(ds))
logger.info(sample)
assert sample['id'], f'Failed to load sample from {repo_id}'
if __name__ == '__main__':
unittest.main()

View File

@@ -100,6 +100,11 @@ class TestToOllama(unittest.TestCase):
ollama = TemplateLoader.to_ollama(
'QuantFactory/Mistral-Nemo-Japanese-Instruct-2408-GGUF')
self.assertTrue(ollama is not None)
ollama = TemplateLoader.to_ollama('AI-ModelScope/Yi-1.5-9B-32K-GGUF')
self.assertTrue(ollama is not None)
ollama = TemplateLoader.to_ollama(
'AI-ModelScope/llava-llama-3-8b-v1_1-gguf')
self.assertTrue(ollama is not None)
if __name__ == '__main__':