diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index d892f61c..10df299e 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -22,7 +22,7 @@ do echo "get gpu lock $gpu" CONTAINER_NAME="modelscope-ci-$idx" - let is_get_file_lock=true + is_get_file_lock=true # pull image if there are update docker pull ${IMAGE_NAME}:${IMAGE_VERSION} diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml index 8060f0bb..b3ae6816 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -54,7 +54,7 @@ jobs: - name: Checkout uses: actions/checkout@v3 with: - lfs: 'true' + lfs: 'false' submodules: 'true' fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} - name: Get changed files @@ -65,8 +65,9 @@ jobs: else echo "PR_CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.event.after }} | xargs)" >> $GITHUB_ENV fi - - name: Checkout LFS objects - run: git lfs checkout + - name: Fetch LFS objects + run: | + git lfs pull - name: Run unittest shell: bash run: | diff --git a/.github/workflows/daily_regression.yaml b/.github/workflows/daily_regression.yaml index 85ca5e0b..bf73d3ea 100644 --- a/.github/workflows/daily_regression.yaml +++ b/.github/workflows/daily_regression.yaml @@ -24,12 +24,16 @@ jobs: sudo chown -R $USER:$USER $ACTION_RUNNER_DIR - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: - lfs: 'true' - submodules: 'true' - - name: Checkout LFS objects - run: git lfs checkout + lfs: false + submodules: true + fetch-depth: 0 + + - name: Fetch LFS objects + run: | + git lfs pull + - name: Run unittest shell: bash run: | diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index fa8e6b04..49972e1f 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -22,7 +22,9 @@ jobs: - name: Install wheel run: pip install wheel && pip install -r requirements/framework.txt - name: Build ModelScope - run: python setup.py sdist bdist_wheel + # Build AST template before packaging + run: | + make whl - name: Publish package to PyPI run: | pip install twine diff --git a/MANIFEST.in b/MANIFEST.in index 5e076f95..c9691801 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ -recursive-include modelscope/configs *.py *.cu *.h *.cpp +recursive-include modelscope/ops *.py *.cu *.h *.cpp recursive-include modelscope/cli/template *.tpl recursive-include modelscope/utils *.json diff --git a/Makefile b/Makefile index 96532199..92385886 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,8 @@ test: .PHONY: whl whl: - python setup.py sdist bdist_wheel + python -c "from modelscope.utils.ast_utils import generate_ast_template; generate_ast_template()" + python setup.py sdist --dist-dir $(WHL_BUILD_DIR)/dist bdist_wheel --dist-dir $(WHL_BUILD_DIR)/dist .PHONY: clean clean: diff --git a/modelscope/cli/cli.py b/modelscope/cli/cli.py index 139f14a5..acf9ad39 100644 --- a/modelscope/cli/cli.py +++ b/modelscope/cli/cli.py @@ -16,6 +16,7 @@ from modelscope.cli.server import ServerCMD from modelscope.cli.upload import UploadCMD from modelscope.hub.constants import MODELSCOPE_ASCII from modelscope.utils.logger import get_logger +from modelscope.version import __version__ logger = get_logger(log_level=logging.WARNING) @@ -24,6 +25,11 @@ def run_cmd(): print(MODELSCOPE_ASCII) parser = argparse.ArgumentParser( 'ModelScope Command Line tool', usage='modelscope <command> [<args>]') parser.add_argument( '-V', '--version', action='version', version=f'ModelScope CLI 
{__version__}') parser.add_argument( '--token', default=None, help='Specify ModelScope SDK token.') subparsers = parser.add_subparsers(help='modelscope commands helpers') diff --git a/modelscope/cli/create.py b/modelscope/cli/create.py index cb51e592..f9e9e48c 100644 --- a/modelscope/cli/create.py +++ b/modelscope/cli/create.py @@ -122,6 +122,13 @@ class CreateCMD(CLICommand): type=str, default='', help='Path in the repository to upload to.') + aigc_group.add_argument( + '--model_source', + type=str, + default='USER_UPLOAD', + help= + 'Source of the AIGC model. `USER_UPLOAD`, `TRAINED_FROM_MODELSCOPE` or `TRAINED_FROM_ALIYUN_FC`.' + ) parser.set_defaults(func=subparser_func) @@ -179,10 +186,11 @@ class CreateCMD(CLICommand): model_path=self.args.model_path, aigc_type=self.args.aigc_type, base_model_type=self.args.base_model_type, - revision=self.args.revision, + tag=self.args.revision, description=self.args.description, base_model_id=self.args.base_model_id, path_in_repo=self.args.path_in_repo, + model_source=self.args.model_source, ) # Convert visibility string to int for the API call diff --git a/modelscope/hub/__init__.py b/modelscope/hub/__init__.py index 36889641..7f788e61 100644 --- a/modelscope/hub/__init__.py +++ b/modelscope/hub/__init__.py @@ -1 +1,2 @@ from .callback import ProgressCallback +from .commit_scheduler import CommitScheduler diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 20adcf39..4cf0c02e 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -54,8 +54,8 @@ from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES, UPLOAD_MAX_FILE_SIZE, UPLOAD_NORMAL_FILE_SIZE_TOTAL_LIMIT, UPLOAD_SIZE_THRESHOLD_TO_ENFORCE_LFS, - DatasetVisibility, Licenses, - ModelVisibility, Visibility, + VALID_SORT_KEYS, DatasetVisibility, + Licenses, ModelVisibility, Visibility, VisibilityMap) from modelscope.hub.errors import (InvalidParameter, NotExistError, NotLoginException, RequestError, @@ -303,6 +303,7 @@ class HubApi: 'WeightsSize': aigc_model.weight_size, 'ModelPath': aigc_model.model_path, 'TriggerWords': aigc_model.trigger_words, + 'ModelSource': aigc_model.model_source, }) if aigc_model.official_tags: @@ -912,6 +913,75 @@ class HubApi: raise_for_http_status(r) return None + def list_datasets(self, + owner_or_group: str, + *, + page_number: Optional[int] = 1, + page_size: Optional[int] = 10, + sort: Optional[str] = None, + search: Optional[str] = None, + endpoint: Optional[str] = None, + ) -> dict: + """List datasets via OpenAPI with pagination, filtering and sorting. + + Args: + owner_or_group (str): Search by dataset authors (including organizations and individuals). + page_number (int, optional): The page number. Defaults to 1. + page_size (int, optional): The page size. Defaults to 10. + sort (str, optional): Sort key, one of ['default', 'downloads', 'likes', 'last_modified']. + If not provided, the server's default sorting is used. + search (str, optional): Search by substring keywords in the dataset's Chinese name, + English name, and authors (including organizations and individuals). + endpoint (str, optional): Hub endpoint to use. When None, use the endpoint specified in the class. + + Returns: + dict: The OpenAPI data payload, e.g. 
+ { + "datasets": [...], + "total_count": int, + "page_number": int, + "page_size": int + } + """ + if not endpoint: + endpoint = self.endpoint + path = f'{endpoint}/openapi/v1/datasets' + + # Build query params + params: Dict[str, Any] = { + 'page_number': page_number, + 'page_size': page_size, + } + if sort: + if sort not in VALID_SORT_KEYS: + raise InvalidParameter( + f'Invalid sort key: {sort}. Supported sort keys: {list(VALID_SORT_KEYS)}') + params['sort'] = sort + if search: + params['search'] = search + if owner_or_group: + params['author'] = owner_or_group + + cookies = ModelScopeConfig.get_cookies() + headers = self.builder_headers(self.headers) + + r = self.session.get( + path, + params=params, + cookies=cookies, + headers=headers + ) + raise_for_http_status(r) + resp = r.json() + + # OpenAPI success schema + if resp.get('success') is True and 'data' in resp: + return resp['data'] + else: + # Fallback for unexpected schema + msg = resp.get('message') or 'Failed to list datasets' + raise RequestError(msg) + def _check_cookie(self, use_cookies: Union[bool, CookieJar] = False) -> CookieJar: # noqa cookies = None if isinstance(use_cookies, CookieJar): @@ -1238,17 +1308,6 @@ class HubApi: logger.info(f'Create dataset success: {dataset_repo_url}') return dataset_repo_url - def list_datasets(self, endpoint: Optional[str] = None): - if not endpoint: - endpoint = self.endpoint - path = f'{endpoint}/api/v1/datasets' - params = {} - r = self.session.get(path, params=params, - headers=self.builder_headers(self.headers)) - raise_for_http_status(r) - dataset_list = r.json()[API_RESPONSE_FIELD_DATA] - return [x['Name'] for x in dataset_list] - def delete_dataset(self, dataset_id: str, endpoint: Optional[str] = None): cookies = ModelScopeConfig.get_cookies() @@ -1802,6 +1861,7 @@ class HubApi: user_name = os.environ[MODELSCOPE_CLOUD_USERNAME] download_uv_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/' \ f'{channel}?user={user_name}' + download_uv_resp = self.session.post(download_uv_url, cookies=cookies, headers=self.builder_headers(self.headers)) download_uv_resp = download_uv_resp.json() diff --git a/modelscope/hub/commit_scheduler.py b/modelscope/hub/commit_scheduler.py new file mode 100644 index 00000000..e924f7b4 --- /dev/null +++ b/modelscope/hub/commit_scheduler.py @@ -0,0 +1,381 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2022-present, the HuggingFace Inc. team. 
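+# +# How it works (a summary of the implementation below): a daemon thread wakes +# up every `interval` minutes and submits an upload job to a single-worker +# ThreadPoolExecutor. While the job runs, HubApi._prepare_upload_folder is +# temporarily patched so that only files whose mtime changed since the last +# successful commit are uploaded; each file is wrapped in a PartialFileIO that +# snapshots its current size, so bytes appended mid-upload are picked up by +# the next scheduled commit instead.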
+import atexit +import contextlib +import os +import time +import types +from concurrent.futures import Future, ThreadPoolExecutor +from io import SEEK_END, SEEK_SET, BytesIO +from pathlib import Path +from threading import Lock, Thread +from typing import Dict, List, Optional, Union + +from modelscope.hub.api import HubApi +from modelscope.hub.constants import Visibility +from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION +from modelscope.utils.logger import get_logger +from modelscope.utils.repo_utils import CommitInfo, RepoUtils + +logger = get_logger() + +IGNORE_GIT_FOLDER_PATTERNS = ['.git', '.git/*', '*/.git', '**/.git/**'] + + +@contextlib.contextmanager +def patch_upload_folder_for_scheduler(scheduler_instance): + """Temporarily patch HubApi._prepare_upload_folder so that scheduled commits only upload changed files.""" + api = scheduler_instance.api + original_prepare = api._prepare_upload_folder + + def patched_prepare_upload_folder( + api_self, + folder_path_or_files: Union[str, Path, List[str], List[Path]], + path_in_repo: str, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + ) -> List[Union[tuple, list]]: + """ + Patched version that supports incremental updates for CommitScheduler. + """ + with scheduler_instance.lock: + if isinstance(folder_path_or_files, list): + raise ValueError( + 'Uploading multiple files or folders is not supported for scheduled commit.' + ) + elif os.path.isfile(folder_path_or_files): + raise ValueError( + 'Uploading a single file is not supported for scheduled commit.') + else: + folder_path = Path(folder_path_or_files).expanduser().resolve() + + logger.debug('Listing files to upload for scheduled commit.') + relpath_to_abspath = { + path.relative_to(folder_path).as_posix(): path + for path in sorted(folder_path.glob('**/*')) if path.is_file() + } + prefix = f"{path_in_repo.strip('/')}/" if path_in_repo else '' + + prepared_repo_objects = [] + files_to_track = {} + + for relpath in RepoUtils.filter_repo_objects( + relpath_to_abspath.keys(), + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns): + local_path = relpath_to_abspath[relpath] + stat = local_path.stat() + if scheduler_instance.last_uploaded.get( + local_path + ) is None or scheduler_instance.last_uploaded[ + local_path] != stat.st_mtime: + partial_file = PartialFileIO(local_path, stat.st_size) + prepared_repo_objects.append( + (prefix + relpath, partial_file)) + files_to_track[local_path] = stat.st_mtime + + scheduler_instance._pending_tracker_updates = files_to_track + + if not prepared_repo_objects: + logger.debug( + 'No changed files to upload for scheduled commit.') + + return prepared_repo_objects + + try: + api._prepare_upload_folder = types.MethodType( + patched_prepare_upload_folder, api) + yield + finally: + api._prepare_upload_folder = original_prepare + + +class CommitScheduler: + """ + A scheduler that automatically uploads a local folder to ModelScope Hub at + specified intervals (e.g., every 5 minutes). + + It's recommended to use the scheduler as a context manager to ensure proper + cleanup and final commit execution when your script completes. Alternatively, + you can manually stop the scheduler using the `stop` method. + + Args: + repo_id (`str`): + The id of the repo to commit to. + folder_path (`str` or `Path`): + Local folder path that will be monitored and uploaded periodically. + interval (`int` or `float`, *optional*): + Time interval in minutes between each upload operation. Defaults to 5 minutes. 
+ path_in_repo (`str`, *optional*): + Target directory path within the repository, such as `"models/"`. + If not specified, files are uploaded to the repository root. + repo_type (`str`, *optional*): + Repository type for the target repo. Defaults to `model`. + revision (`str`, *optional*): + Target branch or revision for commits. Defaults to `master`. + visibility (`str`, *optional*): + The visibility of the repo, one of + `public`, `private` or `internal`. Defaults to `public`. + token (`str`, *optional*): + The token to use to commit to the repo. Defaults to the token saved on the machine. + allow_patterns (`List[str]` or `str`, *optional*): + File patterns to include in uploads. Only files matching these patterns will be uploaded. + ignore_patterns (`List[str]` or `str`, *optional*): + File patterns to exclude from uploads. Files matching these patterns will be skipped. + hub_api (`HubApi`, *optional*): + Custom [`HubApi`] instance for Hub operations. Allows for customized + configurations like user agent or token settings. + + Example: + ```py + >>> from pathlib import Path + >>> from modelscope.hub import CommitScheduler + + # Create scheduler with 10-minute intervals + >>> data_file = Path("workspace/experiment.log") + >>> scheduler = CommitScheduler( + ... repo_id="my_experiments", + ... repo_type="dataset", + ... folder_path=data_file.parent, + ... interval=10 + ... ) + + >>> with data_file.open("a") as f: + ... f.write("experiment started") + + # Later in the workflow... + >>> with data_file.open("a") as f: + ... f.write("experiment completed") + ``` + + Context manager usage: + ```py + >>> from pathlib import Path + >>> from modelscope.hub import CommitScheduler + + >>> with CommitScheduler( + ... repo_id="my_experiments", + ... repo_type="dataset", + ... folder_path="workspace", + ... interval=10 + ... ) as scheduler: + ... log_file = Path("workspace/progress.log") + ... with log_file.open("a") as f: + ... f.write("starting process") + ... # ... perform work ... + ... with log_file.open("a") as f: + ... 
f.write("process finished") + + # Scheduler automatically stops and performs final upload + ``` + """ + + def __init__( + self, + *, + repo_id: str, + folder_path: Union[str, Path], + interval: Union[int, float] = 5, + path_in_repo: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = DEFAULT_REPOSITORY_REVISION, + visibility: Optional[str] = Visibility.PUBLIC, + token: Optional[str] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + hub_api: Optional[HubApi] = None, + ) -> None: + self.api = hub_api or HubApi() + + self.folder_path = Path(folder_path).expanduser().resolve() + if not self.folder_path.exists(): + raise ValueError(f'Folder path does not exist: {folder_path}') + + self.path_in_repo = path_in_repo or '' + self.allow_patterns = allow_patterns + + if ignore_patterns is None: + ignore_patterns = [] + elif isinstance(ignore_patterns, str): + ignore_patterns = [ignore_patterns] + self.ignore_patterns = ignore_patterns + IGNORE_GIT_FOLDER_PATTERNS + + self.repo_url = self.api.create_repo( + repo_id=repo_id, + token=token, + repo_type=repo_type, + visibility=visibility, + exist_ok=True, + create_default_config=False, + ) + self.repo_id = repo_id + self.repo_type = repo_type + self.revision = revision + self.token = token + + # Keep track of already uploaded files + self.last_uploaded: Dict[Path, float] = {} + + if interval <= 0: + raise ValueError( + f'"interval" must be a positive integer, not "{interval}".') + self.lock = Lock() + self.interval = interval + self.__stopped = False + + logger.info( + f'Scheduled job to push {self.folder_path} to {self.repo_id} at an interval of {self.interval} minutes.' + ) + self.executor = ThreadPoolExecutor(max_workers=1) + self._scheduler_thread = Thread( + target=self._run_scheduler, daemon=True) + self._scheduler_thread.start() + atexit.register(self.commit_scheduled_changes) + + def stop(self) -> None: + """Stop the scheduler.""" + self.__stopped = True + + def __enter__(self) -> 'CommitScheduler': + return self + + def __exit__(self, exc_type, exc_value, traceback) -> None: + self.trigger().result() + self.stop() + return + + def _run_scheduler(self) -> None: + while not self.__stopped: + self.last_future = self.trigger() + time.sleep(self.interval * 60) + + def trigger(self) -> Future: + """Trigger a background commit and return a future.""" + return self.executor.submit(self._commit_scheduled_changes) + + def _commit_scheduled_changes(self) -> Optional[CommitInfo]: + if self.__stopped: + return None + + logger.info('(Background) scheduled commit triggered.') + try: + value = self.commit_scheduled_changes() + return value + except Exception as e: + logger.error(f'Error while pushing to Hub: {e}') + raise + + def commit_scheduled_changes(self) -> Optional[CommitInfo]: + """Push folder to the Hub and return commit info if changes are found.""" + try: + self._pending_tracker_updates = {} + with patch_upload_folder_for_scheduler(self): + commit_info = self.api.upload_folder( + repo_id=self.repo_id, + folder_path=self.folder_path, + path_in_repo=self.path_in_repo, + commit_message='Scheduled Commit', + token=self.token, + repo_type=self.repo_type, + allow_patterns=self.allow_patterns, + ignore_patterns=self.ignore_patterns, + revision=self.revision, + ) + + if commit_info is None: + logger.debug( + 'No changed files to upload for scheduled commit.') + return None + + with self.lock: + if hasattr(self, '_pending_tracker_updates'): + 
self.last_uploaded.update(self._pending_tracker_updates) + logger.debug( + f'Updated modification tracker for {len(self._pending_tracker_updates)} files.' + ) + del self._pending_tracker_updates + + return commit_info + + except Exception as e: + # Treat "No files to upload" as a normal no-change situation instead of an error. + if 'No files to upload' in str(e): + logger.debug( + 'No changed files to upload for scheduled commit.') + return None + + if hasattr(self, '_pending_tracker_updates'): + del self._pending_tracker_updates + logger.error(f'Error during scheduled commit: {e}') + raise + + +class PartialFileIO(BytesIO): + """A file-like object that reads only the first part of a file.""" + + def __init__(self, file_path: Union[str, Path], size_limit: int) -> None: + self._file_path = Path(file_path) + self._file = None + self._size_limit = size_limit + self.open() + + def open(self) -> None: + """Open the file and initialize size limit.""" + if self._file is not None: + return + try: + self._file = self._file_path.open('rb') + self._size_limit = min( + self._size_limit or float('inf'), + os.fstat(self._file.fileno()).st_size) + except OSError as e: + logger.error(f'Failed to open file {self._file_path}: {e}') + raise + + def close(self) -> None: + """Close the file if it's open.""" + if self._file is not None: + self._file.close() + self._file = None + + def __del__(self) -> None: + self.close() + return super().__del__() + + def __repr__(self) -> str: + return f'<PartialFileIO file_path={self._file_path} size_limit={self._size_limit}>' + + def __len__(self) -> int: + return self._size_limit + + def __getattribute__(self, name: str): + if name.startswith('_') or name in { + 'read', 'tell', 'seek', 'close', 'open' + }: # only 5 public methods supported + return super().__getattribute__(name) + raise NotImplementedError(f"PartialFileIO does not support '{name}'.") + + def tell(self) -> int: + return self._file.tell() + + def seek(self, __offset: int, __whence: int = SEEK_SET) -> int: + """Seek to a position in the file, but never beyond size_limit.""" + if __whence == SEEK_END: + __offset = len(self) + __offset + __whence = SEEK_SET + + pos = self._file.seek(__offset, __whence) + if pos > self._size_limit: + return self._file.seek(self._size_limit) + return pos + + def read(self, __size: Optional[int] = -1) -> bytes: + """Read at most __size bytes from the current position.""" + current = self.tell() + if __size is None or __size < 0: + # Read until file limit + truncated_size = self._size_limit - current + else: + # Read until file limit or __size + truncated_size = min(__size, self._size_limit - current) + return self._file.read(truncated_size) diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index d08dfef7..4cec4282 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -116,3 +116,18 @@ VisibilityMap = { ModelVisibility.INTERNAL: Visibility.INTERNAL, ModelVisibility.PUBLIC: Visibility.PUBLIC } + + +class SortKey(object): + DEFAULT = 'default' + DOWNLOADS = 'downloads' + LIKES = 'likes' + LAST_MODIFIED = 'last_modified' + + +VALID_SORT_KEYS = { + SortKey.DEFAULT, + SortKey.DOWNLOADS, + SortKey.LIKES, + SortKey.LAST_MODIFIED, +} diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index 2f63edd9..64e4e0de 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -120,11 +120,12 @@ def handle_http_response(response: requests.Response, http_error_msg = 'The request model: %s does not exist!' 
% (model_id) elif 403 == response.status_code: if cookies is None: - http_error_msg = 'Authentication token does not exist, ' - 'failed to access model {model_id} which may not exist or may be ' - 'private. Please login first.' + http_error_msg = (f'Authentication token does not exist, ' + f'failed to access model {model_id} which may not ' + 'exist or may be private. Please login first.') + else: - http_error_msg = 'The authentication token is invalid, failed to access model {model_id}.' + http_error_msg = f'The authentication token is invalid, failed to access model {model_id}.' elif 400 <= response.status_code < 500: http_error_msg = u'%s Client Error: %s, Request id: %s for url: %s' % ( response.status_code, reason, request_id, response.url) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index eeb0d414..2906450b 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -25,6 +25,7 @@ from modelscope.hub.constants import ( MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB, TEMPORARY_FOLDER_NAME) from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, + INTRA_CLOUD_ACCELERATION, REPO_TYPE_DATASET, REPO_TYPE_MODEL, REPO_TYPE_SUPPORT) from modelscope.utils.file_utils import (get_dataset_cache_root, @@ -194,9 +195,22 @@ def _repo_file_download( " online, set 'local_files_only' to False.") _api = HubApi() + headers = { - 'user-agent': ModelScopeConfig.get_user_agent(user_agent=user_agent, ) + 'user-agent': ModelScopeConfig.get_user_agent(user_agent=user_agent, ), + 'snapshot-identifier': str(uuid.uuid4()), } + + if INTRA_CLOUD_ACCELERATION == 'true': + region_id: str = ( + os.getenv('INTRA_CLOUD_ACCELERATION_REGION') + or _api._get_internal_acceleration_domain()) + if region_id: + logger.info( + f'Intra-cloud acceleration enabled for downloading from {repo_id}' + ) + headers['x-aliyun-region-id'] = region_id + if cookies is None: cookies = ModelScopeConfig.get_cookies() repo_files = [] diff --git a/modelscope/hub/utils/aigc.py b/modelscope/hub/utils/aigc.py index 30593d10..62df6cf8 100644 --- a/modelscope/hub/utils/aigc.py +++ b/modelscope/hub/utils/aigc.py @@ -8,7 +8,7 @@ import requests from tqdm.auto import tqdm from modelscope.hub.utils.utils import (MODELSCOPE_URL_SCHEME, - encode_image_to_base64, get_endpoint) + encode_media_to_base64, get_endpoint) from modelscope.utils.logger import get_logger logger = get_logger() @@ -69,17 +69,20 @@ class AigcModel: 'main-strong', 'character-strong' } - def __init__(self, - aigc_type: str, - base_model_type: str, - model_path: str, - base_model_id: str = '', - tag: Optional[str] = 'v1.0', - description: Optional[str] = 'this is an aigc model', - cover_images: Optional[List[str]] = None, - path_in_repo: Optional[str] = '', - trigger_words: Optional[List[str]] = None, - official_tags: Optional[List[str]] = None): + def __init__( + self, + aigc_type: str, + base_model_type: str, + model_path: str, + base_model_id: str = '', + tag: Optional[str] = 'v1.0', + description: Optional[str] = 'this is an aigc model', + cover_images: Optional[List[str]] = None, + path_in_repo: Optional[str] = '', + trigger_words: Optional[List[str]] = None, + official_tags: Optional[List[str]] = None, + model_source: Optional[str] = 'USER_UPLOAD', + ): """ Initializes the AigcModel helper. @@ -94,12 +97,15 @@ class AigcModel: path_in_repo (str, optional): Path in the repository. trigger_words (List[str], optional): Trigger words for the AIGC Lora model. 
official_tags (List[str], optional): Official tags for the AIGC model. Defaults to None. + model_source (str, optional): Source of the model. + `USER_UPLOAD`, `TRAINED_FROM_MODELSCOPE` or `TRAINED_FROM_ALIYUN_FC`. Defaults to 'USER_UPLOAD'. """ self.model_path = model_path self.aigc_type = aigc_type self.base_model_type = base_model_type self.tag = tag self.description = description + self.model_source = model_source # Process cover images - convert local paths to base64 data URLs if cover_images is not None: processed_cover_images = [] @@ -111,7 +117,7 @@ class AigcModel: or img.startswith('data:')): try: # Convert local path to base64 data URL - processed_img = encode_image_to_base64(img) + processed_img = encode_media_to_base64(img) processed_cover_images.append(processed_img) logger.info('Converted local image to base64: %s', os.path.basename(img)) @@ -383,7 +389,8 @@ class AigcModel: 'weight_sha256': self.weight_sha256, 'weight_size': self.weight_size, 'trigger_words': self.trigger_words, - 'official_tags': self.official_tags + 'official_tags': self.official_tags, + 'model_source': self.model_source, } @classmethod diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 35c562e1..161f866b 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -146,16 +146,15 @@ def compute_hash(file_path): return sha256_hash.hexdigest() -def file_integrity_validation(file_path, expected_sha256): +def file_integrity_validation(file_path, expected_sha256) -> bool: """Validate the file hash is expected, if not, delete the file Args: file_path (str): The file to validate expected_sha256 (str): The expected sha256 hash - Raises: - FileIntegrityError: If file_path hash is not expected. - + Returns: + bool: True if the file is valid, False otherwise """ file_sha256 = compute_hash(file_path) if not file_sha256 == expected_sha256: @@ -163,7 +162,9 @@ def file_integrity_validation(file_path, expected_sha256): msg = 'File %s integrity check failed, expected sha256 signature is %s, actual is %s, the download may be incomplete, please try again.' % ( # noqa E501 file_path, expected_sha256, file_sha256) logger.error(msg) - raise FileIntegrityError(msg) + return False + + return True def add_content_to_file(repo, @@ -380,40 +381,40 @@ def convert_timestamp(time_stamp: Union[int, str, datetime], ) -def encode_image_to_base64(image_path: str) -> str: +def encode_media_to_base64(media_file_path: str) -> str: """ - Encode image file to base64 string. + Encode image or video file to base64 string. 
Args: - image_path (str): Path to the image file + media_file_path (str): Path to the image or video file Returns: str: Base64 encoded string with data URL prefix Raises: - FileNotFoundError: If image file doesn't exist - ValueError: If file is not a valid image format + FileNotFoundError: If image/video file doesn't exist + ValueError: If the media type of the file cannot be determined """ import base64 import mimetypes # Expand user path - image_path = os.path.expanduser(image_path) + media_file_path = os.path.expanduser(media_file_path) - if not os.path.exists(image_path): - raise FileNotFoundError(f'Image file not found: {image_path}') + if not os.path.exists(media_file_path): + raise FileNotFoundError(f'Media file not found: {media_file_path}') - if not os.path.isfile(image_path): - raise ValueError(f'Path is not a file: {image_path}') + if not os.path.isfile(media_file_path): + raise ValueError(f'Path is not a file: {media_file_path}') # Get MIME type - mime_type, _ = mimetypes.guess_type(image_path) - if not mime_type or not mime_type.startswith('image/'): - raise ValueError(f'File is not a valid image format: {image_path}') + mime_type, _ = mimetypes.guess_type(media_file_path) + if not mime_type: + raise ValueError(f'Unable to determine the media type of: {media_file_path}') # Read and encode file - with open(image_path, 'rb') as image_file: - image_data = image_file.read() + with open(media_file_path, 'rb') as media_file: + image_data = media_file.read() base64_data = base64.b64encode(image_data).decode('utf-8') # Return data URL format diff --git a/modelscope/msdatasets.lock b/modelscope/msdatasets.lock new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index f867081e..e275fc87 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -741,7 +741,8 @@ def _download_additional_modules( namespace: str, revision: str, imports: Tuple[str, str, str, str], - download_config: Optional[DownloadConfig] + download_config: Optional[DownloadConfig], + trust_remote_code: Optional[bool] = False, ) -> List[Tuple[str, str]]: """ Download additional module for a module <name>.py at URL (or local path) <base_path>/<name>.py Args: """ local_imports = [] library_imports = [] + + # Check if we need to execute remote code + has_remote_code = any( + import_type in ('internal', 'external') + for import_type, _, _, _ in imports + ) + + if has_remote_code and not trust_remote_code: + raise ValueError( + f'Loading {name} requires executing code from the repository. ' + 'This is disabled by default for security reasons. ' + 'If you trust the authors of this dataset, you can enable it with ' + '`trust_remote_code=True`.' 
+ ) + download_config = download_config.copy() if download_config.download_desc is None: download_config.download_desc = 'Downloading extra modules' @@ -863,6 +879,7 @@ def get_module_with_script(self) -> DatasetModule: revision=revision, imports=imports, download_config=self.download_config, + trust_remote_code=self.trust_remote_code, ) additional_files = [] if dataset_infos_path: diff --git a/modelscope/preprocessors/templates/loader.py b/modelscope/preprocessors/templates/loader.py index f9c8f944..d4dca7bb 100644 --- a/modelscope/preprocessors/templates/loader.py +++ b/modelscope/preprocessors/templates/loader.py @@ -194,6 +194,10 @@ template_info = [ modelfile_prefix= 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/deepseek-llm', ), + TemplateInfo( + template_regex=f'.*{cases("DeepSeek-V3.1")}.*', + modelfile_prefix= + 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/deepseek-v3.1'), TemplateInfo( template_regex= f'.*{cases("deepseek")}.*{cases("v3")}.*', @@ -994,6 +998,14 @@ template_info = [ template_regex=f'.*{cases("deepscaler")}.*', modelfile_prefix= 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/deepscaler'), + TemplateInfo( + template_regex=f'.*{cases("granite-4.0")}.*', + modelfile_prefix= + 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/granite4'), + TemplateInfo( + template_regex=f'.*{cases("gpt-oss")}.*', + modelfile_prefix= + 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/gpt-oss'), ] diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index 657ebb33..ec7a8457 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -768,7 +768,7 @@ def load_from_prebuilt(file_path=None): if os.path.exists(file_path): index = _load_index(file_path, with_template=True) else: - index = None + index = generate_ast_template() return index diff --git a/modelscope/utils/file_utils.py b/modelscope/utils/file_utils.py index 9414e2ce..7afea306 100644 --- a/modelscope/utils/file_utils.py +++ b/modelscope/utils/file_utils.py @@ -257,6 +257,7 @@ def get_file_hash( progress.update(len(byte_chunk)) file_hash = file_hash.hexdigest() final_chunk_size = buffer_size + file_path_or_obj.seek(0, os.SEEK_SET) else: progress.close() diff --git a/modelscope/utils/repo_utils.py b/modelscope/utils/repo_utils.py index 55f01f37..f8f5cf71 100644 --- a/modelscope/utils/repo_utils.py +++ b/modelscope/utils/repo_utils.py @@ -418,6 +418,7 @@ class UploadInfo: file_hash_info: dict = file_hash_info or get_file_hash(fileobj) fileobj.seek(0, os.SEEK_SET) sample = fileobj.read(512) + fileobj.seek(0, os.SEEK_SET) return cls( sha256=file_hash_info['file_hash'], size=file_hash_info['file_size'], diff --git a/modelscope/version.py b/modelscope/version.py index bf05e94b..5090d6bd 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. 
-__version__ = '1.31.0' +__version__ = '1.32.0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future -__release_datetime__ = '2025-10-10 23:59:59' +__release_datetime__ = '2025-11-07 12:00:00' diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..3b95faac --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,81 @@ +[project] +name = "modelscope" +dynamic = ["version", "dependencies", "optional-dependencies"] +description = "ModelScope: bring the notion of Model-as-a-Service to life." +readme = {file = "README.md", content-type = "text/markdown"} +license = "Apache-2.0" +license-files = ["LICENSE"] +authors = [ + {name = "ModelScope team"}, + {email = "contact@modelscope.cn"} +] +keywords = ["python", "nlp", "science", "cv", "speech", "multi-modal"] +requires-python = ">=3.9" +classifiers = [ + 'Development Status :: 4 - Beta', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', +] + +[project.urls] +Homepage = "https://github.com/modelscope/modelscope" + +[project.scripts] +modelscope = "modelscope.cli.cli:run_cmd" + +[build-system] +requires = ["setuptools>=69", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.packages.find] +where = ["."] +include = ["modelscope*"] + +[tool.setuptools.dynamic] +version = {attr = "modelscope.version.__version__"} +dependencies = {file = ["requirements/hub.txt"]} + +[tool.setuptools.dynamic.optional-dependencies] +hub = {file = ["requirements/hub.txt"]} +datasets = {file = ["requirements/datasets.txt"]} +framework = {file = ["requirements/framework.txt"]} +server = {file = ["requirements/server.txt"]} +docs = {file = ["requirements/docs.txt"]} +tests = {file = ["requirements/tests.txt"]} + +# domain specific with framework base +cv = {file = ["requirements/framework.txt", "requirements/cv.txt"]} +nlp = {file = ["requirements/framework.txt", "requirements/nlp.txt"]} +multi-modal = {file = ["requirements/framework.txt", "requirements/multi-modal.txt"]} +science = {file = ["requirements/framework.txt", "requirements/science.txt"]} + +# audio specific with framework base +audio_asr = {file = ["requirements/framework.txt", "requirements/audio/audio_asr.txt"]} +audio_codec = {file = ["requirements/framework.txt", "requirements/audio/audio_codec.txt"]} +audio_tts = {file = ["requirements/framework.txt", "requirements/audio/audio_tts.txt"]} +audio_kws = {file = ["requirements/framework.txt", "requirements/audio/audio_kws.txt"]} +audio_signal = {file = ["requirements/framework.txt", "requirements/audio/audio_signal.txt"]} +audio = {file = ["requirements/framework.txt", + "requirements/audio/audio_asr.txt", + "requirements/audio/audio_codec.txt", + "requirements/audio/audio_tts.txt", + "requirements/audio/audio_kws.txt", + "requirements/audio/audio_signal.txt"]} + +# skip audio requirements due to its hard dependency which may cause installation failure +all = {file = [ + "requirements/hub.txt", + "requirements/datasets.txt", + "requirements/framework.txt", + "requirements/cv.txt", + "requirements/nlp.txt", + "requirements/multi-modal.txt", + "requirements/science.txt", + "requirements/server.txt", +]} diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index b2f767c5..46ac0054 100644 --- a/requirements/multi-modal.txt +++ 
b/requirements/multi-modal.txt @@ -2,11 +2,11 @@ accelerate cloudpickle decord>=0.6.0 diffusers>=0.25.0 -ftfy>=6.0.3 # 0.12.1 has issue of No such file or directory: 'fairseq/version.txt' # 0.12.2 not support py311 #fairseq==0.12.2 -https://github.com/liyaodev/fairseq/releases/download/v0.12.3.1/fairseq-0.12.3.1-cp311-cp311-linux_x86_64.whl +fairseq-fixed==0.12.3.1 +ftfy>=6.0.3 librosa==0.10.1 opencv-python pycocoevalcap>=1.2 diff --git a/setup.py b/setup.py index 76a6a7bf..70fb2673 100644 --- a/setup.py +++ b/setup.py @@ -1,239 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# !/usr/bin/env python -import os -import shutil -import subprocess -from setuptools import find_packages, setup +# -*- coding: utf-8 -*- -from modelscope.utils.constant import Fields +from setuptools import setup - -def readme(): - with open('README.md', encoding='utf-8') as f: - content = f.read() - return content - - -version_file = 'modelscope/version.py' - - -def get_git_hash(): - - def _minimal_ext_cmd(cmd): - # construct minimal environment - env = {} - for k in ['SYSTEMROOT', 'PATH', 'HOME']: - v = os.environ.get(k) - if v is not None: - env[k] = v - # LANGUAGE is used on win32 - env['LANGUAGE'] = 'C' - env['LANG'] = 'C' - env['LC_ALL'] = 'C' - out = subprocess.Popen( - cmd, stdout=subprocess.PIPE, env=env).communicate()[0] - return out - - try: - out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) - sha = out.strip().decode('ascii') - except OSError: - sha = 'unknown' - - return sha - - -def get_hash(): - assert os.path.exists('.git'), '.git directory does not exist' - sha = get_git_hash()[:7] - return sha - - -def get_version(): - with open(version_file, 'r', encoding='utf-8') as f: - exec(compile(f.read(), version_file, 'exec')) - return locals()['__version__'] - - -def parse_requirements(fname='requirements.txt', with_version=True): - """ - Parse the package dependencies listed in a requirements file but strips - specific versioning information. 
- - Args: - fname (str): path to requirements file - with_version (bool, default=False): if True include version specs - - Returns: - List[str]: list of requirements items - - CommandLine: - python -c "import setup; print(setup.parse_requirements())" - """ - import re - import sys - from os.path import exists - require_fpath = fname - - def parse_line(line): - """ - Parse information from a line in a requirements text file - """ - if line.startswith('-r '): - # Allow specifying requirements in other files - target = line.split(' ')[1] - relative_base = os.path.dirname(fname) - absolute_target = os.path.join(relative_base, target) - for info in parse_require_file(absolute_target): - yield info - else: - info = {'line': line} - if line.startswith('-e '): - info['package'] = line.split('#egg=')[1] - else: - # Remove versioning from the package - pat = '(' + '|'.join(['>=', '==', '>']) + ')' - parts = re.split(pat, line, maxsplit=1) - parts = [p.strip() for p in parts] - - info['package'] = parts[0] - if len(parts) > 1: - op, rest = parts[1:] - if ';' in rest: - # Handle platform specific dependencies - # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies - version, platform_deps = map(str.strip, - rest.split(';')) - info['platform_deps'] = platform_deps - else: - version = rest # NOQA - info['version'] = (op, version) - yield info - - def parse_require_file(fpath): - with open(fpath, 'r', encoding='utf-8') as f: - for line in f.readlines(): - line = line.strip() - if line.startswith('http'): - print('skip http requirements %s' % line) - continue - if line and not line.startswith('#') and not line.startswith( - '--'): - for info in parse_line(line): - yield info - elif line and line.startswith('--find-links'): - eles = line.split() - for e in eles: - e = e.strip() - if 'http' in e: - info = dict(dependency_links=e) - yield info - - def gen_packages_items(): - items = [] - deps_link = [] - if exists(require_fpath): - for info in parse_require_file(require_fpath): - if 'dependency_links' not in info: - parts = [info['package']] - if with_version and 'version' in info: - parts.extend(info['version']) - if not sys.version.startswith('3.4'): - # apparently package_deps are broken in 3.4 - platform_deps = info.get('platform_deps') - if platform_deps is not None: - parts.append(';' + platform_deps) - item = ''.join(parts) - items.append(item) - else: - deps_link.append(info['dependency_links']) - return items, deps_link - - return gen_packages_items() - - -def pack_resource(): - # pack resource such as configs and tools - root_dir = 'package/' - if os.path.isdir(root_dir): - shutil.rmtree(root_dir) - os.makedirs(root_dir) - - proj_dir = root_dir + 'modelscope/' - shutil.copytree('./modelscope', proj_dir) - shutil.copytree('./configs', proj_dir + 'configs') - shutil.copytree('./requirements', 'package/requirements') - shutil.copy('./requirements.txt', 'package/requirements.txt') - shutil.copy('./MANIFEST.in', 'package/MANIFEST.in') - shutil.copy('./README.md', 'package/README.md') - - -if __name__ == '__main__': - # write_version_py() - from modelscope.utils.ast_utils import generate_ast_template - generate_ast_template() - pack_resource() - os.chdir('package') - install_requires, deps_link = parse_requirements('requirements.txt') - extra_requires = {} - all_requires = [] - for field in dir(Fields): - if field.startswith('_'): - continue - field = getattr(Fields, field) - extra_requires[field], _ = parse_requirements( - f'requirements/{field}.txt') - - 
# skip audio requirements due to its hard dependency which - # result in mac/windows compatibility problems - if field != Fields.audio: - all_requires.append(extra_requires[field]) - for subfiled in ['asr', 'kws', 'signal', 'tts']: - filed_name = f'audio_{subfiled}' - extra_requires[filed_name], _ = parse_requirements( - f'requirements/audio/{filed_name}.txt') - framework_requires = extra_requires['framework'] - # add framework dependencies to every field - for field, requires in extra_requires.items(): - if field not in [ - 'server', 'framework', 'hub', 'datasets' - ]: # server need install model's field dependencies before. - extra_requires[field] = framework_requires + extra_requires[field] - extra_requires['all'] = all_requires - - setup( - name='modelscope', - version=get_version(), - description= - 'ModelScope: bring the notion of Model-as-a-Service to life.', - long_description=readme(), - long_description_content_type='text/markdown', - author='ModelScope team', - author_email='contact@modelscope.cn', - keywords='python,nlp,science,cv,speech,multi-modal', - url='https://github.com/modelscope/modelscope', - packages=find_packages(exclude=('configs', 'demo')), - include_package_data=True, - package_data={ - '': ['*.h', '*.cpp', '*.cu'], - }, - classifiers=[ - 'Development Status :: 4 - Beta', - 'License :: OSI Approved :: Apache Software License', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - ], - license='Apache License 2.0', - tests_require=parse_requirements('requirements/tests.txt'), - install_requires=install_requires, - extras_require=extra_requires, - entry_points={ - 'console_scripts': ['modelscope=modelscope.cli.cli:run_cmd'] - }, - dependency_links=deps_link, - zip_safe=False) +setup() diff --git a/tests/hub/test_commit_scheduler.py b/tests/hub/test_commit_scheduler.py new file mode 100644 index 00000000..d746c330 --- /dev/null +++ b/tests/hub/test_commit_scheduler.py @@ -0,0 +1,583 @@ +import atexit +import shutil +import tempfile +import time +import unittest +import uuid +from io import SEEK_END +from pathlib import Path +from unittest.mock import MagicMock, patch + +from modelscope.hub.api import HubApi +from modelscope.hub.commit_scheduler import CommitScheduler, PartialFileIO +from modelscope.hub.constants import Visibility +from modelscope.hub.errors import NotExistError +from modelscope.hub.file_download import _repo_file_download +from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION +from modelscope.utils.repo_utils import CommitInfo, CommitOperationAdd +from modelscope.utils.test_utils import (TEST_ACCESS_TOKEN1, TEST_MODEL_ORG, + delete_credential, test_level) + + +class TestCommitScheduler(unittest.TestCase): + """Test suite for ModelScope CommitScheduler functionality.""" + + def setUp(self) -> None: + """Set up test environment with temporary directories and mock API.""" + self.api = HubApi() + self.repo_name = f'test-commit-scheduler-{uuid.uuid4().hex[:8]}' + self.repo_id = f'{TEST_MODEL_ORG}/{self.repo_name}' + + # Create temporary cache directory + self.cache_dir = Path(tempfile.mkdtemp()) + self.watched_folder = self.cache_dir / 'watched_folder' + self.watched_folder.mkdir(exist_ok=True, parents=True) + + # Initialize scheduler reference for cleanup + self.scheduler = None + + def 
tearDown(self) -> None: + """Clean up test resources.""" + + def cleanup(scheduler): + try: + scheduler.stop() + remove_atexit_callback(scheduler) + except Exception as e: + print( + f'Warning: Could not clean up scheduler {scheduler.repo_id}: {e}' + ) + + def remove_atexit_callback(scheduler): + try: + atexit.unregister(scheduler.commit_scheduled_changes) + except Exception as e: + print( + f'Warning: Could not remove atexit callback for scheduler {scheduler.repo_id}: {e}' + ) + + # Stop scheduler if it exists + if self.scheduler is not None: + try: + cleanup(self.scheduler) + except Exception: + pass + + # Try to delete test repo (may not exist for mocked tests) + try: + if hasattr(self, 'api') and TEST_ACCESS_TOKEN1: + self.api.login(TEST_ACCESS_TOKEN1) + self.api.delete_repo(repo_id=self.repo_id, repo_type='dataset') + except Exception: + pass + + # Clean up temporary directories + if self.cache_dir.exists(): + shutil.rmtree(self.cache_dir, ignore_errors=True) + + delete_credential() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_invalid_folder_path_nonexistent(self) -> None: + """Test that CommitScheduler raises error for non-existent folder.""" + nonexistent_path = self.cache_dir / 'nonexistent' + + with self.assertRaises(ValueError) as cm: + CommitScheduler( + repo_id=self.repo_id, + folder_path=nonexistent_path, + interval=1, + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + self.assertIn('does not exist', str(cm.exception)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_invalid_interval(self) -> None: + """Test that CommitScheduler raises error for invalid interval values.""" + # Test zero interval + with self.assertRaises(ValueError) as cm: + CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + interval=0, + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + self.assertIn('positive', str(cm.exception)) + + # Test negative interval + with self.assertRaises(ValueError) as cm: + CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + interval=-1, + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + self.assertIn('positive', str(cm.exception)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_initialization_with_defaults(self) -> None: + """Test CommitScheduler initialization with default parameters.""" + with patch.object(HubApi, 'create_repo') as mock_create: + mock_create.return_value = f'https://modelscope.cn/datasets/{self.repo_id}' + + self.scheduler = CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + + # Check default values + self.assertEqual(self.scheduler.repo_id, self.repo_id) + self.assertEqual(self.scheduler.folder_path, + self.watched_folder.resolve()) + self.assertEqual(self.scheduler.interval, 5) # default 5 minutes + self.assertEqual(self.scheduler.path_in_repo, '') + self.assertEqual(self.scheduler.revision, + DEFAULT_REPOSITORY_REVISION) + self.assertIsInstance(self.scheduler.last_uploaded, dict) + + # Verify create_repo was called + mock_create.assert_called_once() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_initialization_with_custom_parameters(self) -> None: + """Test CommitScheduler initialization with custom parameters.""" + custom_interval = 10 + custom_path_in_repo = 'custom/path' + 
custom_revision = 'develop' + allow_patterns = ['*.txt', '*.json'] + ignore_patterns = ['*.log', 'temp/*'] + + with patch.object(HubApi, 'create_repo') as mock_create: + mock_create.return_value = f'https://modelscope.cn/datasets/{self.repo_id}' + + self.scheduler = CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + interval=custom_interval, + path_in_repo=custom_path_in_repo, + revision=custom_revision, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + visibility=Visibility.PUBLIC, + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + + # Check custom values + self.assertEqual(self.scheduler.interval, custom_interval) + self.assertEqual(self.scheduler.path_in_repo, custom_path_in_repo) + self.assertEqual(self.scheduler.revision, custom_revision) + self.assertEqual(self.scheduler.allow_patterns, allow_patterns) + + # Check ignore patterns include git folders + expected_ignore = ignore_patterns + [ + '.git', '.git/*', '*/.git', '**/.git/**' + ] + self.assertEqual(self.scheduler.ignore_patterns, expected_ignore) + + # Verify create_repo was called with correct visibility + mock_create.assert_called_once() + call_args = mock_create.call_args + self.assertEqual(call_args.kwargs.get('visibility'), 'public') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @patch.object(CommitScheduler, 'commit_scheduled_changes') + def test_mocked_scheduler_execution(self, mock_push: MagicMock) -> None: + """Test scheduler with mocked commit_scheduled_changes method.""" + mock_push.return_value = CommitInfo( + commit_url= + 'https://modelscope.cn/datasets/test_scheduler_unit/commit/test123', + commit_message='Test commit', + commit_description='', + oid='test123') + + with patch.object(HubApi, 'create_repo'): + self.scheduler = CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + interval=1 / 60 / 10, # every 0.1s for fast testing + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + + # Wait for at least a couple of scheduler cycles + time.sleep(0.5) + + # Should have been called multiple times + self.assertGreater(len(mock_push.call_args_list), 1) + + # Check the last future result + if hasattr(self.scheduler, + 'last_future') and self.scheduler.last_future: + result = self.scheduler.last_future.result() + self.assertEqual(result.oid, 'test123') + self.assertEqual(result.commit_message, 'Test commit') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_scheduler_stop(self) -> None: + """Test stopping the scheduler.""" + with patch.object(HubApi, 'create_repo'): + self.scheduler = CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + interval=1, # 1 minute + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + + # Scheduler should be running + self.assertFalse(self.scheduler._CommitScheduler__stopped) + + # Stop the scheduler + self.scheduler.stop() + + # Scheduler should be stopped + self.assertTrue(self.scheduler._CommitScheduler__stopped) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_context_manager(self) -> None: + """Test CommitScheduler as context manager.""" + file_path = self.watched_folder / 'test_file.txt' + + with patch.object(HubApi, 'create_repo'), \ + patch.object(CommitScheduler, 'commit_scheduled_changes') as mock_push: + mock_push.return_value = CommitInfo( + commit_url= + 
'https://modelscope.cn/datasets/test_scheduler_unit/commit/test123', + commit_message='Test commit', + commit_description='', + oid='test123') + + with CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + interval=5, # 5 minutes - won't trigger during test + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) as scheduler: + # Write a file inside the context + file_path.write_text('test content') + + # Reference for later assertions + self.scheduler = scheduler + + # After exiting context, scheduler should be stopped + self.assertTrue(self.scheduler._CommitScheduler__stopped) + + # Should have triggered commit_scheduled_changes on exit + mock_push.assert_called() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trigger_manual_commit(self) -> None: + """Test manually triggering a commit.""" + with patch.object(HubApi, 'create_repo'), \ + patch.object(CommitScheduler, 'commit_scheduled_changes') as mock_push: + mock_push.return_value = CommitInfo( + commit_url= + 'https://modelscope.cn/datasets/test_scheduler_unit/commit/test123', + commit_message='Manual commit', + commit_description='', + oid='manual123') + + self.scheduler = CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + interval=60, # Long interval to avoid auto-trigger + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + future = self.scheduler.trigger() + result = future.result() + + # Verify the result + self.assertEqual(result.oid, 'manual123') + self.assertEqual(result.commit_message, 'Manual commit') + mock_push.assert_called() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_stopped_scheduler_no_push(self) -> None: + """Test that stopped scheduler doesn't perform push operations.""" + with patch.object(HubApi, 'create_repo'): + + self.scheduler = CommitScheduler( + repo_id=self.repo_id, + folder_path=self.watched_folder, + interval=60, + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + + # Stop the scheduler immediately + self.scheduler.stop() + + # Try to trigger after stopping + future = self.scheduler.trigger() + result = future.result() + + # Result should be None for stopped scheduler + self.assertIsNone(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_sync_local_folder_to_hub(self) -> None: + """Test syncing a local folder to the remote repo.""" + hub_cache = self.cache_dir / 'hub' + file_path = self.watched_folder / 'file.txt' + bin_path = self.watched_folder / 'file.bin' + git_path = self.watched_folder / '.git' + self.scheduler = CommitScheduler( + folder_path=self.watched_folder, + repo_id=self.repo_id, + interval=1 / 60, + hub_api=self.api, + repo_type='dataset', + token=TEST_ACCESS_TOKEN1) + + # 1 push to hub triggered + time.sleep(0.5) + + # write content to files + with file_path.open('a') as f: + f.write('first line\n') + with bin_path.open('a') as f: + f.write('binary content') + with git_path.open('a') as f: + f.write('git content\n') + + # 2 push to hub triggered + time.sleep(2) + self.scheduler.last_future.result() + + # new content in file + with file_path.open('a') as f: + f.write('second line\n') + + # 1 push to hub triggered + time.sleep(1) + self.scheduler.last_future.result() + + with bin_path.open('a') as f: + f.write(' updated') + + # 5 push to hub triggered + time.sleep(5) # wait for all threads/uploads to complete + self.scheduler.stop() + 
+        self.scheduler.last_future.result()
+
+        repo_id = self.scheduler.repo_id
+
+        def _download(filename: str, revision: str) -> Path:
+            return Path(
+                _repo_file_download(
+                    repo_id=repo_id,
+                    file_path=filename,
+                    revision=revision,
+                    cache_dir=hub_cache,
+                    repo_type='dataset'))
+
+        # Check file.txt consistency
+        txt_push = _download(filename='file.txt', revision='master')
+        self.assertEqual(txt_push.read_text(), 'first line\nsecond line\n')
+
+        # Check file.bin consistency
+        bin_push = _download(filename='file.bin', revision='master')
+        self.assertEqual(bin_push.read_text(), 'binary content updated')
+
+        # Check .git file was not uploaded (should be ignored)
+        with self.assertRaises(NotExistError):
+            _download(filename='.git', revision='master')
+
+
+class TestPartialFileIO(unittest.TestCase):
+    """Test suite for PartialFileIO functionality."""
+
+    def setUp(self) -> None:
+        """Set up test environment with a temporary file."""
+        self.cache_dir = Path(tempfile.mkdtemp())
+        self.test_file = self.cache_dir / 'file.txt'
+        self.test_file.write_text('123456789abcdef')  # 15 bytes
+
+    def tearDown(self) -> None:
+        """Clean up test resources."""
+        if self.cache_dir.exists():
+            shutil.rmtree(self.cache_dir, ignore_errors=True)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_partial_file_read_with_limit(self) -> None:
+        """Test reading a partial file with a size limit."""
+        partial_file = PartialFileIO(self.test_file, size_limit=5)
+
+        # Should read only first 5 bytes
+        content = partial_file.read()
+        self.assertEqual(content, b'12345')
+
+        # Second read should return empty
+        content = partial_file.read()
+        self.assertEqual(content, b'')
+
+        partial_file.close()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_partial_file_read_by_chunks(self) -> None:
+        """Test reading a partial file in chunks."""
+        partial_file = PartialFileIO(self.test_file, size_limit=8)
+
+        # Read in chunks
+        chunk1 = partial_file.read(3)
+        self.assertEqual(chunk1, b'123')
+
+        chunk2 = partial_file.read(3)
+        self.assertEqual(chunk2, b'456')
+
+        chunk3 = partial_file.read(3)
+        self.assertEqual(chunk3, b'78')  # Only 2 bytes left within limit
+
+        chunk4 = partial_file.read(3)
+        self.assertEqual(chunk4, b'')  # Nothing left
+
+        partial_file.close()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_partial_file_read_oversized_chunk(self) -> None:
+        """Test reading more than the size limit in one go."""
+        partial_file = PartialFileIO(self.test_file, size_limit=5)
+
+        # Request more than limit
+        content = partial_file.read(20)
+        self.assertEqual(content, b'12345')  # Should return only up to limit
+
+        partial_file.close()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_partial_file_len(self) -> None:
+        """Test __len__ method returns the correct size limit."""
+        partial_file = PartialFileIO(self.test_file, size_limit=7)
+        self.assertEqual(len(partial_file), 7)
+        partial_file.close()
+
+        # Size limit larger than actual file should be capped
+        large_partial = PartialFileIO(self.test_file, size_limit=100)
+        self.assertEqual(len(large_partial), 15)  # Actual file size
+        large_partial.close()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_partial_file_seek_and_tell(self) -> None:
+        """Test seek and tell operations."""
+        partial_file = PartialFileIO(self.test_file, size_limit=10)
+
+        # Initial position
+        self.assertEqual(partial_file.tell(), 0)
+
+        # Read some bytes
+        partial_file.read(3)
+        self.assertEqual(partial_file.tell(), 3)
+
+        # Seek to beginning
+        partial_file.seek(0)
+        self.assertEqual(partial_file.tell(), 0)
+
+        # Seek to specific position
+        partial_file.seek(5)
+        self.assertEqual(partial_file.tell(), 5)
+
+        # Seek beyond limit should be capped
+        partial_file.seek(50)
+        self.assertEqual(partial_file.tell(), 10)  # Capped at size limit
+
+        # Seek from end
+        partial_file.seek(-2, SEEK_END)
+        self.assertEqual(partial_file.tell(), 8)  # 10 - 2
+
+        partial_file.close()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_partial_file_not_implemented_methods(self) -> None:
+        """Test that unsupported methods raise NotImplementedError."""
+        partial_file = PartialFileIO(self.test_file, size_limit=5)
+
+        # These methods should not be implemented
+        with self.assertRaises(NotImplementedError):
+            partial_file.readline()
+
+        with self.assertRaises(NotImplementedError):
+            partial_file.write(b'test')
+
+        partial_file.close()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_partial_file_repr(self) -> None:
+        """Test string representation of PartialFileIO."""
+        partial_file = PartialFileIO(self.test_file, size_limit=5)
+        repr_str = repr(partial_file)
+
+        self.assertEqual(
+            repr_str,
+            f'')
+
+        partial_file.close()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_file_modification_after_creation(self) -> None:
+        """Test that file modifications after PartialFileIO creation don't affect size limit."""
+        partial_file = PartialFileIO(self.test_file, size_limit=20)
+
+        # Original length is capped at file size
+        self.assertEqual(len(partial_file), 15)
+
+        with self.test_file.open('ab') as f:
+            f.write(b'additional_content')
+
+        # Size limit should remain the same (captured at creation)
+        self.assertEqual(len(partial_file), 15)
+
+        # Content read should still be limited to original size
+        content = partial_file.read()
+        self.assertEqual(len(content), 15)
+        self.assertEqual(content, b'123456789abcdef')
+
+        partial_file.close()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_high_size_limit(self) -> None:
+        """Test a size limit larger than the file size."""
+        file = PartialFileIO(self.test_file, size_limit=20)
+        with self.test_file.open('ab') as f:
+            f.write(b'ghijkl')
+
+        # File size limit is truncated to the actual file size at instance creation (not on the fly)
+        self.assertEqual(len(file), 15)
+        self.assertEqual(file._size_limit, 15)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_with_commit_operation_add(self) -> None:
+        """Test with CommitOperationAdd."""
+        op_truncated = CommitOperationAdd(
+            path_or_fileobj=PartialFileIO(self.test_file, size_limit=5),
+            path_in_repo='test_file.txt')
+        self.assertEqual(op_truncated.upload_info.size, 5)
+        self.assertEqual(op_truncated.upload_info.sample, b'12345')
+
+        with op_truncated.as_file() as f:
+            self.assertEqual(f.read(), b'12345')
+
+        # Full file
+        op_full = CommitOperationAdd(
+            path_or_fileobj=PartialFileIO(self.test_file, size_limit=9),
+            path_in_repo='test_file.txt')
+        self.assertEqual(op_full.upload_info.size, 9)
+        self.assertEqual(op_full.upload_info.sample, b'123456789')
+
+        with op_full.as_file() as f:
+            self.assertEqual(f.read(), b'123456789')
+
+        # Truncated file has a different hash than the full file
+        self.assertNotEqual(op_truncated.upload_info.sha256,
+                            op_full.upload_info.sha256)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/hub/test_hub_list.py b/tests/hub/test_hub_list.py
new file mode 100644
index 00000000..91a2b5c0
--- /dev/null
+++ b/tests/hub/test_hub_list.py
@@ -0,0 +1,38 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope import HubApi
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+default_owner = 'modelscope'
+
+
+class HubListHubTest(unittest.TestCase):
+
+    def setUp(self):
+        self.api = HubApi()
+
+    @unittest.skipUnless(test_level() >= 3, 'skip test in current test level')
+    def test_list_datasets(self):
+        # Use default args
+        result = self.api.list_datasets(owner_or_group=default_owner)
+        logger.info(f'List datasets result: {result}')
+
+    @unittest.skipUnless(test_level() >= 3, 'skip test in current test level')
+    def test_list_datasets_with_args(self):
+        result = self.api.list_datasets(
+            owner_or_group=default_owner,
+            page_number=1,
+            page_size=2,
+            sort='downloads',
+            search='chinese',
+        )
+        logger.info(f'List datasets with full args result: {result}')
+
+    def test_list_models(self):
+        result = self.api.list_models(
+            owner_or_group='Qwen', page_number=1, page_size=2)
+        logger.info(f'List models result: {result}')
diff --git a/tests/tools/test_to_ollama.py b/tests/tools/test_to_ollama.py
index 274fd56d..e3e925f0 100644
--- a/tests/tools/test_to_ollama.py
+++ b/tests/tools/test_to_ollama.py
@@ -122,6 +122,18 @@ class TestToOllama(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_check_template_type(self):
+        _test_check_tmpl_type(
+            'unsloth/gpt-oss-20b-GGUF',
+            'gpt-oss',
+            gguf_meta={'general.name': 'Gpt-Oss-20B'})
+        _test_check_tmpl_type(
+            'unsloth/granite-4.0-h-tiny-GGUF',
+            'granite4',
+            gguf_meta={'general.name': 'Granite-4.0-H-Tiny'})
+        _test_check_tmpl_type(
+            'unsloth/DeepSeek-V3.1-GGUF',
+            'deepseek-v3.1',
+            gguf_meta={'general.name': 'Deepseek-V3.1'})
         _test_check_tmpl_type('unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF',
                               'qwen3-coder')
         _test_check_tmpl_type(