Merge commit 'bd1544bef0945677bfd6c0ac2b24f353f2f0817d' into feat/template

* commit 'bd1544bef0945677bfd6c0ac2b24f353f2f0817d':
  fix template: llava-llama-3 & yi-1.5 (#1011)
  add cmd line option of clear-cache (#1009)
  do not download pt and pth files for autoconfig, autotokenizer and generation config (#1008)
  Update issue templates
  Adapt new datasets (#1002)
  template and ollama in modelscope (#995)
  Unify dataset download log and remove tqdm disable option (#997)

# Conflicts:
#	modelscope/preprocessors/templates/loader.py
#	tests/tools/test_to_ollama.py
This commit is contained in:
yuze.zyz
2024-10-10 17:49:13 +08:00
13 changed files with 293 additions and 69 deletions

View File

@@ -3,7 +3,7 @@ name: Bug report
about: Create a bug report to help us improve
title: ''
labels: ''
assignees: Firmament-cyou, tastelikefeet, wangxingjun778, wenmengzhou, zzclynn
assignees: tastelikefeet, wangxingjun778, yingdachen
---
@@ -36,14 +36,14 @@ A clear and concise description of what the bug is.
Please @ corresponding people according to your problem:
Model related: @wenmengzhou @tastelikefeet
Model related: @tastelikefeet
Model hub related: @liuyhwangyh
Model hub related: @liuyhwangyh @tastelikefeet @wangxingjun778
Dataset releated: @wangxingjun778
Finetune related: @tastelikefeet @Jintao-Huang
Pipeline related: @Firmament-cyou @wenmengzhou
Pipeline related: @tastelikefeet @wangxingjun778
Contribute your model: @zzclynn
Contribute your model: @yingdachen

View File

@@ -3,7 +3,7 @@ name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: tastelikefeet, wangxingjun778, wenmengzhou, yingdachen, zzclynn
assignees: yingdachen, wangxingjun778, tastelikefeet
---

View File

@@ -3,7 +3,7 @@ name: Question
about: Describe this issue template's purpose here.
title: ''
labels: ''
assignees: zzclynn,wenmengzhou
assignees: tastelikefeet, wangxingjun778, yingdachen
---
@@ -18,7 +18,7 @@ Before asking a question, make sure you have:
Please @ corresponding people according to your problem:
Model related: @wenmengzhou @tastelikefeet
Model related: @tastelikefeet
Model hub related: @liuyhwangyh
@@ -26,6 +26,6 @@ Dataset releated: @wangxingjun778
Finetune related: @tastelikefeet @Jintao-Huang
Pipeline related: @Firmament-cyou @wenmengzhou
Pipeline related: @tastelikefeet @wangxingjun778
Contribute your model: @zzclynn
Contribute your model: @yingdachen

View File

@@ -0,0 +1,107 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
from argparse import ArgumentParser
from pathlib import Path
from modelscope.cli.base import CLICommand
from modelscope.hub.constants import TEMPORARY_FOLDER_NAME
def subparser_func(args):
    """Build the command object for the ``clear-cache`` sub parser."""
    cmd = ClearCacheCMD(args)
    return cmd
class ClearCacheCMD(CLICommand):
    """CLI command that clears the local ModelScope cache.

    Clears the cache of a single model (``--model``), a single dataset
    (``--dataset``), or the entire cache directory when neither is given,
    after an interactive confirmation prompt.
    """

    name = 'clear-cache'

    def __init__(self, args):
        self.args = args
        # Cache root defaults to ~/.cache/modelscope unless MODELSCOPE_CACHE
        # is set in the environment.
        self.cache_dir = os.getenv(
            'MODELSCOPE_CACHE',
            Path.home().joinpath('.cache', 'modelscope'))

    @staticmethod
    def define_args(parsers: ArgumentParser):
        """Define args for the clear-cache command.

        ``--model`` and ``--dataset`` are mutually exclusive; omitting both
        selects the whole cache.
        """
        parser = parsers.add_parser(ClearCacheCMD.name)
        group = parser.add_mutually_exclusive_group()
        group.add_argument(
            '--model',
            type=str,
            help=
            'The id of the model whose cache will be cleared. For clear-cache, '
            'if neither model or dataset id is provided, entire cache will be cleared.'
        )
        group.add_argument(
            '--dataset',
            type=str,
            help=
            'The id of the dataset whose cache will be cleared. For clear-cache, '
            'if neither model or dataset id is provided, entire cache will be cleared.'
        )
        parser.set_defaults(func=subparser_func)

    def execute(self):
        self._execute_with_confirmation()

    def _execute_with_confirmation(self):
        """Prompt the user for confirmation, then delete the selected cache.

        Uses early returns instead of nested branches; local names avoid
        shadowing the ``all`` and ``id`` builtins.
        """
        clear_all = False
        single_model = False
        entity_id = None
        prompt = '\nYou are about to delete '
        if self.args.model or self.args.dataset:
            if self.args.model:
                entity_id = self.args.model
                single_model = True
                prompt = prompt + f'local cache for model {entity_id}. '
            else:
                entity_id = self.args.dataset
                prompt = prompt + f'local cache for dataset {entity_id}. '
        else:
            prompt = prompt + f'entire ModelScope cache at {self.cache_dir}, including ALL models and dataset.\n'
            clear_all = True
        user_input = input(
            prompt
            + '\nPlease press Y or y to proceed, any other key to abort.\n'
        ).strip().upper()
        if user_input != 'Y':
            print('Operation aborted.')
            return
        if clear_all:
            self._remove_directory(self.cache_dir)
            print('Cache cleared.')
            return
        # Models live under 'hub', datasets under 'datasets'; both may also
        # leave partial downloads under the temporary folder.
        sub_dir = 'hub' if single_model else 'datasets'
        entity_directory = os.path.join(self.cache_dir, sub_dir, entity_id)
        temp_directory = os.path.join(self.cache_dir, sub_dir,
                                      TEMPORARY_FOLDER_NAME, entity_id)
        entity_removed = self._remove_directory(entity_directory)
        temp_removed = self._remove_directory(temp_directory)
        if not entity_removed and not temp_removed:
            entity_type = 'Model' if single_model else 'Dataset'
            print(
                f'Cache for {entity_type} {entity_id} not found. Nothing to do.'
            )
        else:
            print('Cache cleared.')

    def _remove_directory(self, path):
        """Remove ``path`` recursively.

        Returns:
            bool: True if the directory was removed, False if it did not
            exist or removal failed. (The original implicitly returned None
            for a missing path; False is now returned explicitly.)
        """
        if not os.path.exists(path):
            return False
        try:
            shutil.rmtree(path)
            print(f'Cache folder {path} removed.')
            return True
        except Exception as e:  # best-effort cleanup: report and continue
            print(f'An error occurred while clearing cache at {path}: {e}')
            return False

View File

@@ -3,6 +3,7 @@
import argparse
import logging
from modelscope.cli.clearcache import ClearCacheCMD
from modelscope.cli.download import DownloadCMD
from modelscope.cli.login import LoginCMD
from modelscope.cli.modelcard import ModelCardCMD
@@ -23,6 +24,7 @@ def run_cmd():
subparsers = parser.add_subparsers(help='modelscope commands helpers')
DownloadCMD.define_args(subparsers)
ClearCacheCMD.define_args(subparsers)
PluginsCMD.define_args(subparsers)
PipelineCMD.define_args(subparsers)
ModelCardCMD.define_args(subparsers)

View File

@@ -555,7 +555,7 @@ def get_module_without_script(self) -> DatasetModule:
download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = 'Downloading readme'
download_config.download_desc = 'Downloading [README.md]'
try:
url_or_filename = _ms_api.get_dataset_file_url(
file_name='README.md',
@@ -989,7 +989,6 @@ class DatasetsWrapperHF:
download_config=download_config,
download_mode=download_mode,
verification_mode=verification_mode,
try_from_hf_gcs=False,
num_proc=num_proc,
storage_options=storage_options,
# base_path=builder_instance.base_path,

View File

@@ -5,27 +5,138 @@
import json
import os
import re
import copy
import shutil
import time
import warnings
import inspect
from contextlib import contextmanager
from functools import partial
from pathlib import Path
from typing import Optional, Union
from urllib.parse import urljoin, urlparse
import requests
from tqdm import tqdm
from datasets import config
from datasets.utils.file_utils import hash_url_to_filename, get_authentication_headers_for_url, ftp_head, fsspec_head, \
http_head, _raise_if_offline_mode_is_enabled, ftp_get, fsspec_get, http_get
from datasets.utils.file_utils import hash_url_to_filename, \
get_authentication_headers_for_url, fsspec_head, fsspec_get
from filelock import FileLock
from modelscope.utils.config_ds import MS_DATASETS_CACHE
from modelscope.utils.logger import get_logger
from modelscope.hub.api import ModelScopeConfig
from modelscope import __version__
logger = get_logger()
def get_datasets_user_agent_ms(user_agent: Optional[Union[str, dict]] = None) -> str:
    """Build the user-agent string for dataset download requests.

    The base string reports the datasets/python/pyarrow versions, appends
    torch/tensorflow/jax versions when available, and finally folds in the
    caller-supplied ``user_agent`` (a str, or a dict joined as ``k/v`` pairs).
    """
    parts = [
        f'datasets/{__version__}',
        f'python/{config.PY_VERSION}',
        f'pyarrow/{config.PYARROW_VERSION}',
    ]
    if config.TORCH_AVAILABLE:
        parts.append(f'torch/{config.TORCH_VERSION}')
    if config.TF_AVAILABLE:
        parts.append(f'tensorflow/{config.TF_VERSION}')
    if config.JAX_AVAILABLE:
        parts.append(f'jax/{config.JAX_VERSION}')
    if isinstance(user_agent, dict):
        parts.append('; '.join(f'{k}/{v}' for k, v in user_agent.items()))
    elif isinstance(user_agent, str):
        parts.append(user_agent)
    return '; '.join(parts)
def _request_with_retry_ms(
    method: str,
    url: str,
    max_retries: int = 2,
    base_wait_time: float = 0.5,
    max_wait_time: float = 2,
    timeout: float = 10.0,
    **params,
) -> requests.Response:
    """Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff.

    Args:
        method (str): HTTP method, such as 'GET' or 'HEAD'.
        url (str): The URL of the resource to fetch.
        max_retries (int): Maximum number of retries, defaults to 2.
        base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between
            retries then grows exponentially, capped by max_wait_time.
        max_wait_time (float): Maximum amount of time between two retries, in seconds.
        timeout (float): Per-request timeout in seconds.
        **params (additional keyword arguments): Params to pass to :obj:`requests.request`.

    Raises:
        requests.exceptions.ConnectTimeout / ConnectionError: re-raised once
        the retry budget is exhausted.
    """
    tries, success = 0, False
    response = None
    while not success:
        tries += 1
        try:
            response = requests.request(method=method.upper(), url=url, timeout=timeout, **params)
            success = True
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as err:
            if tries > max_retries:
                raise err
            # Fixed: was f'[{tries/max_retries}]', which logged a float
            # ratio (e.g. "[0.5]") instead of retry progress like "[1/2]".
            logger.info(f'{method} request to {url} timed out, retrying... [{tries}/{max_retries}]')
            sleep_time = min(max_wait_time, base_wait_time * 2**(tries - 1))  # Exponential backoff
            time.sleep(sleep_time)
    return response
def http_head_ms(
    url, proxies=None, headers=None, cookies=None, allow_redirects=True, timeout=10.0, max_retries=0
) -> requests.Response:
    """Issue a HEAD request for ``url`` with the datasets user-agent and retry support."""
    # Copy caller headers so the injected user-agent never mutates them.
    request_headers = copy.deepcopy(headers) or {}
    request_headers['user-agent'] = get_datasets_user_agent_ms(
        user_agent=request_headers.get('user-agent'))
    return _request_with_retry_ms(
        method='HEAD',
        url=url,
        proxies=proxies,
        headers=request_headers,
        cookies=cookies,
        allow_redirects=allow_redirects,
        timeout=timeout,
        max_retries=max_retries,
    )
def http_get_ms(
    url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0, desc=None
) -> Optional[requests.Response]:
    """Stream ``url`` into ``temp_file`` with a tqdm progress bar.

    When ``temp_file`` is None the raw streaming response is returned
    instead of being consumed. When ``resume_size`` > 0 a Range header is
    sent so a partial download can be continued; a 416 answer means the
    range is not satisfiable and nothing is written.
    """
    request_headers = {} if headers is None else dict(headers)
    request_headers['user-agent'] = get_datasets_user_agent_ms(
        user_agent=request_headers.get('user-agent'))
    if resume_size > 0:
        request_headers['Range'] = f'bytes={resume_size:d}-'
    response = _request_with_retry_ms(
        method='GET',
        url=url,
        stream=True,
        proxies=proxies,
        headers=request_headers,
        cookies=cookies,
        max_retries=max_retries,
        timeout=timeout,
    )
    if temp_file is None:
        return response
    if response.status_code == 416:  # Range not satisfiable
        return
    content_length = response.headers.get('Content-Length')
    total = None if content_length is None else resume_size + int(content_length)
    progress = tqdm(total=total, initial=resume_size, unit_scale=True, unit='B', desc=desc or 'Downloading')
    for chunk in response.iter_content(chunk_size=1024):
        progress.update(len(chunk))
        temp_file.write(chunk)
    progress.close()
def get_from_cache_ms(
url,
cache_dir=None,
@@ -42,7 +153,7 @@ def get_from_cache_ms(
ignore_url_params=False,
storage_options=None,
download_desc=None,
disable_tqdm=False,
disable_tqdm=None,
) -> str:
"""
Given a URL, look for the corresponding file in the local cache.
@@ -88,6 +199,8 @@ def get_from_cache_ms(
# if we don't ask for 'force_download' then we spare a request
filename = hash_url_to_filename(cached_url, etag=None)
cache_path = os.path.join(cache_dir, filename)
if download_desc is None:
download_desc = 'Downloading [' + filename + ']'
if os.path.exists(cache_path) and not force_download and not use_etag:
return cache_path
@@ -100,16 +213,14 @@ def get_from_cache_ms(
# We don't have the file locally or we need an eTag
if not local_files_only:
scheme = urlparse(url).scheme
if scheme == 'ftp':
connected = ftp_head(url)
elif scheme not in ('http', 'https'):
if scheme not in ('http', 'https'):
response = fsspec_head(url, storage_options=storage_options)
# s3fs uses "ETag", gcsfs uses "etag"
etag = (response.get('ETag', None) or response.get('etag', None)) if use_etag else None
connected = True
try:
cookies = ModelScopeConfig.get_cookies()
response = http_head(
response = http_head_ms(
url,
allow_redirects=True,
proxies=proxies,
@@ -166,7 +277,6 @@ def get_from_cache_ms(
)
elif response is not None and response.status_code == 404:
raise FileNotFoundError(f"Couldn't find file at {url}")
_raise_if_offline_mode_is_enabled(f'Tried to reach {url}')
if head_error is not None:
raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})")
elif response is not None:
@@ -205,48 +315,21 @@ def get_from_cache_ms(
# Download to temporary file, then copy to cache path once finished.
# Otherwise, you get corrupt cache entries if the download gets interrupted.
with temp_file_manager() as temp_file:
logger.info(f'Downloading to {temp_file.name}')
# GET file object
if scheme == 'ftp':
ftp_get(url, temp_file)
elif scheme not in ('http', 'https'):
fsspec_get_sig = inspect.signature(fsspec_get)
if 'disable_tqdm' in fsspec_get_sig.parameters:
fsspec_get(url,
temp_file,
storage_options=storage_options,
desc=download_desc,
disable_tqdm=disable_tqdm
)
else:
fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc)
if scheme not in ('http', 'https'):
fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc)
else:
http_get_sig = inspect.signature(http_get)
if 'disable_tqdm' in http_get_sig.parameters:
http_get(
url,
temp_file=temp_file,
proxies=proxies,
resume_size=resume_size,
headers=headers,
cookies=cookies,
max_retries=max_retries,
desc=download_desc,
disable_tqdm=disable_tqdm,
)
else:
http_get(
url,
temp_file=temp_file,
proxies=proxies,
resume_size=resume_size,
headers=headers,
cookies=cookies,
max_retries=max_retries,
desc=download_desc,
)
http_get_ms(
url,
temp_file=temp_file,
proxies=proxies,
resume_size=resume_size,
headers=headers,
cookies=cookies,
max_retries=max_retries,
desc=download_desc,
)
logger.info(f'storing {url} in cache at {cache_path}')
shutil.move(temp_file.name, cache_path)

View File

@@ -83,7 +83,7 @@ template_info = [
TemplateInfo(
template=TemplateType.chatml,
template_regex=
f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*{chat_suffix}.*',
f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*',
modelfile_link=
'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/yi-1.5.modelfile',
),
@@ -110,6 +110,10 @@ template_info = [
'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/glm4.modelfile',
),
TemplateInfo(
template_regex=f'.*{cases("llava-llama-3")}.*',
modelfile_link='https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/llava-llama-3.modelfile'),
# baichuan
TemplateInfo(
template=TemplateType.baichuan,

View File

@@ -127,7 +127,9 @@ def _patch_pretrained_class():
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
**kwargs):
ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
ignore_file_pattern = [
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
]
model_dir = get_model_dir(pretrained_model_name_or_path,
ignore_file_pattern, **kwargs)
return ori_from_pretrained(cls, model_dir, *model_args, **kwargs)
@@ -143,14 +145,18 @@ def _patch_pretrained_class():
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
**kwargs):
ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
ignore_file_pattern = [
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
]
model_dir = get_model_dir(pretrained_model_name_or_path,
ignore_file_pattern, **kwargs)
return ori_from_pretrained(cls, model_dir, *model_args, **kwargs)
@classmethod
def get_config_dict(cls, pretrained_model_name_or_path, **kwargs):
ignore_file_pattern = [r'\w+\.bin', r'\w+\.safetensors']
ignore_file_pattern = [
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
]
model_dir = get_model_dir(pretrained_model_name_or_path,
ignore_file_pattern, **kwargs)
return ori_get_config_dict(cls, model_dir, **kwargs)
@@ -242,11 +248,20 @@ AutoModelForTokenClassification = get_wrapped_class(
AutoModelForTokenClassificationHF)
AutoTokenizer = get_wrapped_class(
AutoTokenizerHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors'])
AutoTokenizerHF,
ignore_file_pattern=[
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
])
AutoConfig = get_wrapped_class(
AutoConfigHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors'])
AutoConfigHF,
ignore_file_pattern=[
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
])
GenerationConfig = get_wrapped_class(
GenerationConfigHF, ignore_file_pattern=[r'\w+\.bin', r'\w+\.safetensors'])
GenerationConfigHF,
ignore_file_pattern=[
r'\w+\.bin', r'\w+\.safetensors', r'\w+\.pth', r'\w+\.pt'
])
GPTQConfig = GPTQConfigHF
AwqConfig = AwqConfigHF
BitsAndBytesConfig = BitsAndBytesConfigHF

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=2.18.0,<3.0.0
datasets>=3.0.0
einops
oss2
Pillow

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=2.18.0,<3.0.0
datasets>=3.0.0
einops
oss2
Pillow

View File

@@ -44,6 +44,15 @@ class TestStreamLoad(unittest.TestCase):
assert sample['question'], f'Failed to load sample from {repo_id}'
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_stream_swift_jsonl(self):
repo_id: str = 'iic/MSAgent-MultiRole'
ds = MsDataset.load(repo_id, split='train', use_streaming=True)
sample = next(iter(ds))
logger.info(sample)
assert sample['id'], f'Failed to load sample from {repo_id}'
if __name__ == '__main__':
unittest.main()

View File

@@ -100,6 +100,11 @@ class TestToOllama(unittest.TestCase):
ollama = TemplateLoader.to_ollama(
'QuantFactory/Mistral-Nemo-Japanese-Instruct-2408-GGUF')
self.assertTrue(ollama is not None)
ollama = TemplateLoader.to_ollama('AI-ModelScope/Yi-1.5-9B-32K-GGUF')
self.assertTrue(ollama is not None)
ollama = TemplateLoader.to_ollama(
'AI-ModelScope/llava-llama-3-8b-v1_1-gguf')
self.assertTrue(ollama is not None)
if __name__ == '__main__':