Add create_repo and upload UTs (#1282)

* add delete_repo, delete_dataset, exist_ok in create_repo

* add UT for create_repo
Xingjun.Wang
2025-04-07 10:24:26 +08:00
committed by GitHub
parent e85fe5e885
commit 6adc8614c9
6 changed files with 302 additions and 177 deletions

View File

@@ -226,7 +226,7 @@ class HubApi:
headers=self.builder_headers(self.headers))
handle_http_post_error(r, path, body)
raise_on_error(r.json())
model_repo_url = f'{endpoint}/{model_id}'
model_repo_url = f'{endpoint}/models/{model_id}'
return model_repo_url
def delete_model(self, model_id: str, endpoint: Optional[str] = None):
@@ -401,6 +401,33 @@ class HubApi:
'Failed to check existence of repo: %s, make sure you have access authorization.'
% repo_type)
def delete_repo(self, repo_id: str, repo_type: str, endpoint: Optional[str] = None):
"""
Delete a repository from ModelScope.
Args:
repo_id (`str`):
A namespace (user or organization) and a repo name, separated by a `/`.
repo_type (`str`):
The type of the repository. Supported types are `model` and `dataset`.
endpoint (`str`):
The endpoint to use. If not provided, defaults to `https://www.modelscope.cn`.
Can be set to `https://ai.modelscope.ai` for the international version.
"""
if not endpoint:
endpoint = self.endpoint
if repo_type == REPO_TYPE_DATASET:
self.delete_dataset(repo_id, endpoint)
elif repo_type == REPO_TYPE_MODEL:
self.delete_model(repo_id, endpoint)
else:
raise Exception(f'Arg repo_type {repo_type} not supported.')
logger.info(f'Repo {repo_id} deleted successfully.')
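A minimal usage sketch for the new delete_repo API (not part of this diff; the repo ids and the token placeholder below are hypothetical):
from modelscope.hub.api import HubApi
from modelscope.utils.constant import REPO_TYPE_DATASET, REPO_TYPE_MODEL

api = HubApi()
api.login('<your_access_token>')  # a valid ModelScope token is required
# Delete a model repo and a dataset repo by id; both ids are made-up examples.
api.delete_repo(repo_id='my_org/my_model', repo_type=REPO_TYPE_MODEL)
api.delete_repo(repo_id='my_org/my_dataset', repo_type=REPO_TYPE_DATASET)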
@staticmethod
def _create_default_config(model_dir):
cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
@@ -924,6 +951,21 @@ class HubApi:
dataset_list = r.json()[API_RESPONSE_FIELD_DATA]
return [x['Name'] for x in dataset_list]
def delete_dataset(self, dataset_id: str, endpoint: Optional[str] = None):
cookies = ModelScopeConfig.get_cookies()
if not endpoint:
endpoint = self.endpoint
if cookies is None:
raise ValueError('Token does not exist, please login first.')
path = f'{endpoint}/api/v1/datasets/{dataset_id}'
r = self.session.delete(path,
cookies=cookies,
headers=self.builder_headers(self.headers))
raise_for_http_status(r)
raise_on_error(r.json())
def get_dataset_id_and_type(self, dataset_name: str, namespace: str, endpoint: Optional[str] = None):
""" Get the dataset id and type. """
if not endpoint:
@@ -1361,15 +1403,42 @@ class HubApi:
chinese_name: Optional[str] = '',
license: Optional[str] = Licenses.APACHE_V2,
endpoint: Optional[str] = None,
exist_ok: Optional[bool] = False,
**kwargs,
) -> str:
"""
Create a repository on the ModelScope Hub.
Args:
repo_id (str): The repo id in the format of `owner_name/repo_name`.
token (Union[str, bool, None]): The access token.
visibility (Optional[str]): The visibility of the repo,
can be `public`, `private`, or `internal`; defaults to `public`.
repo_type (Optional[str]): The repo type, defaults to `model`.
chinese_name (Optional[str]): The Chinese name of the repo.
license (Optional[str]): The license of the repo, defaults to `apache-2.0`.
endpoint (Optional[str]): The endpoint to use.
In the format of `https://www.modelscope.cn` or `https://www.modelscope.ai`.
exist_ok (Optional[bool]): If the repo already exists, return its url directly
instead of raising an error. Defaults to `False`.
**kwargs: Additional arguments.
Returns:
str: The repo url.
"""
# TODO: exist_ok
if not repo_id:
raise ValueError('Repo id cannot be empty!')
if not endpoint:
endpoint = self.endpoint
self.login(access_token=token)
repo_exists: bool = self.repo_exists(repo_id, repo_type=repo_type, endpoint=endpoint)
if repo_exists:
if exist_ok:
return f'{endpoint}/{repo_type}s/{repo_id}'
else:
raise ValueError(f'Repo {repo_id} already exists!')
self.login(access_token=token, endpoint=endpoint)
repo_id_list = repo_id.split('/')
if len(repo_id_list) != 2:
@@ -1382,31 +1451,28 @@ class HubApi:
if visibility is None:
raise ValueError(f'Invalid visibility: {visibility}, '
f'supported visibilities: `public`, `private`, `internal`')
if not self.repo_exists(repo_id, repo_type=repo_type):
repo_url: str = self.create_model(
model_id=repo_id,
visibility=visibility,
license=license,
chinese_name=chinese_name,
)
with tempfile.TemporaryDirectory() as temp_cache_dir:
from modelscope.hub.repository import Repository
repo = Repository(temp_cache_dir, repo_id)
default_config = {
'framework': 'pytorch',
'task': 'text-generation',
'allow_remote': True
}
config_json = kwargs.get('config_json')
if not config_json:
config_json = {}
config = {**default_config, **config_json}
add_content_to_file(
repo,
'configuration.json', [json.dumps(config)],
ignore_push_error=True)
else:
repo_url = f'{endpoint}/{repo_id}'
repo_url: str = self.create_model(
model_id=repo_id,
visibility=visibility,
license=license,
chinese_name=chinese_name,
)
with tempfile.TemporaryDirectory() as temp_cache_dir:
from modelscope.hub.repository import Repository
repo = Repository(temp_cache_dir, repo_id)
default_config = {
'framework': 'pytorch',
'task': 'text-generation',
'allow_remote': True
}
config_json = kwargs.get('config_json')
if not config_json:
config_json = {}
config = {**default_config, **config_json}
add_content_to_file(
repo,
'configuration.json', [json.dumps(config)],
ignore_push_error=True)
elif repo_type == REPO_TYPE_DATASET:
visibilities = {k: v for k, v in DatasetVisibility.__dict__.items() if not k.startswith('__')}
@@ -1414,20 +1480,19 @@ class HubApi:
if visibility is None:
raise ValueError(f'Invalid visibility: {visibility}, '
f'supported visibilities: `public`, `private`, `internal`')
if not self.repo_exists(repo_id, repo_type=repo_type):
repo_url: str = self.create_dataset(
dataset_name=repo_name,
namespace=namespace,
chinese_name=chinese_name,
license=license,
visibility=visibility,
)
else:
repo_url = f'{endpoint}/datasets/{namespace}/{repo_name}'
repo_url: str = self.create_dataset(
dataset_name=repo_name,
namespace=namespace,
chinese_name=chinese_name,
license=license,
visibility=visibility,
)
else:
raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')
logger.info(f'Repo created: {repo_url}')
return repo_url
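A hedged usage sketch of create_repo with the new exist_ok flag, following the login-then-create pattern used in the unit tests below (the repo id and token placeholder are hypothetical):
from modelscope.hub.api import HubApi
from modelscope.utils.constant import REPO_TYPE_MODEL

api = HubApi()
api.login('<your_access_token>')  # hypothetical token placeholder
repo_url = api.create_repo(repo_id='my_org/my_model',
                           repo_type=REPO_TYPE_MODEL,
                           exist_ok=True)
# Calling again with exist_ok=True hits the exist_ok branch and returns the
# existing repo url instead of raising ValueError.
same_url = api.create_repo(repo_id='my_org/my_model',
                           repo_type=REPO_TYPE_MODEL,
                           exist_ok=True)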
def create_commit(

View File

@@ -323,7 +323,8 @@ class UploadInfo:
file_hash_info = file_hash_info or get_file_hash(path)
size = file_hash_info['file_size']
sha = file_hash_info['file_hash']
sample = open(path, 'rb').read(512)
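# Read only the first 512 bytes as a sample; the context manager ensures the file handle is closed.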
with open(path, 'rb') as f:
sample = f.read(512)
return cls(sha256=sha, size=size, sample=sample)

View File

@@ -0,0 +1,58 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest
import uuid
from modelscope import HubApi
from modelscope.utils.constant import REPO_TYPE_DATASET, REPO_TYPE_MODEL
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import TEST_ACCESS_TOKEN1
from modelscope.utils.test_utils import TEST_MODEL_ORG as TEST_ORG
from modelscope.utils.test_utils import delete_credential, test_level
logger = get_logger()
class TestCreateRepo(unittest.TestCase):
def setUp(self):
self.api = HubApi()
self.api.login(TEST_ACCESS_TOKEN1)
self.repo_id_model: str = f'{TEST_ORG}/test_create_repo_model_{uuid.uuid4().hex[-6:]}'
self.repo_id_dataset: str = f'{TEST_ORG}/test_create_repo_dataset_{uuid.uuid4().hex[-6:]}'
def tearDown(self):
self.api.delete_repo(
repo_id=self.repo_id_model, repo_type=REPO_TYPE_MODEL)
self.api.delete_repo(
repo_id=self.repo_id_dataset, repo_type=REPO_TYPE_DATASET)
delete_credential()
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_create_repo(self):
logger.info(
f'TEST: Creating repo {self.repo_id_model} and {self.repo_id_dataset} ...'
)
try:
self.api.create_repo(
repo_id=self.repo_id_model,
repo_type=REPO_TYPE_MODEL,
exist_ok=True)
except Exception as e:
logger.error(f'Failed to create repo {self.repo_id_model} !')
raise e
try:
self.api.create_repo(
repo_id=self.repo_id_dataset,
repo_type=REPO_TYPE_DATASET,
exist_ok=True)
except Exception as e:
logger.error(f'Failed to create repo {self.repo_id_dataset} !')
raise e
logger.info(
f'TEST: Created repo {self.repo_id_model} and {self.repo_id_dataset} successfully !'
)

View File

@@ -0,0 +1,138 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import struct
import tempfile
import unittest
import uuid
import json
from modelscope import HubApi
from modelscope.utils.constant import REPO_TYPE_DATASET, REPO_TYPE_MODEL
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import TEST_ACCESS_TOKEN1
from modelscope.utils.test_utils import TEST_MODEL_ORG as TEST_ORG
from modelscope.utils.test_utils import delete_credential, test_level
logger = get_logger()
class TestUploadFileFolder(unittest.TestCase):
def setUp(self):
self.api = HubApi()
self.api.login(TEST_ACCESS_TOKEN1)
self.repo_id_model: str = f'{TEST_ORG}/test_upload_file_folder_model_{uuid.uuid4().hex[-6:]}'
self.repo_id_dataset: str = f'{TEST_ORG}/test_upload_file_folder_dataset_{uuid.uuid4().hex[-6:]}'
self.work_dir = tempfile.mkdtemp()
self.model_file_path = f'{self.work_dir}/test_model.bin'
self.dataset_file_path = f'{self.work_dir}/test_data.jsonl'
logger.info(f'Work directory: {self.work_dir}')
self.api.create_repo(
repo_id=self.repo_id_model,
repo_type=REPO_TYPE_MODEL,
exist_ok=True)
self.api.create_repo(
repo_id=self.repo_id_dataset,
repo_type=REPO_TYPE_DATASET,
exist_ok=True)
self._construct_file()
def tearDown(self):
# Remove repositories
self.api.delete_repo(
repo_id=self.repo_id_model, repo_type=REPO_TYPE_MODEL)
self.api.delete_repo(
repo_id=self.repo_id_dataset, repo_type=REPO_TYPE_DATASET)
# Clean up the temporary credentials
delete_credential()
# Clean up the temporary directory
shutil.rmtree(self.work_dir)
def _construct_file(self):
# Construct data
data_list = [
{
'id': 1,
'value': 3.14
},
{
'id': 2,
'value': 2.71
},
{
'id': 3,
'value': 3.69
},
{
'id': 4,
'value': 9.31
},
{
'id': 5,
'value': 1.21
},
]
with open(self.model_file_path, 'wb') as f:
for entry in data_list:
packed_data = struct.pack('if', entry['id'], entry['value'])
f.write(packed_data)
logger.info(f'Constructed model file: {self.model_file_path}')
with open(self.dataset_file_path, 'w') as f:
for entry in data_list:
f.write(json.dumps(entry) + '\n')
logger.info(f'Constructed dataset file: {self.dataset_file_path}')
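For reference, a small sketch (not part of the test) of reading the packed binary records back with the same 'if' struct format; the file path below is a hypothetical local path:
import struct

record_size = struct.calcsize('if')  # one int32 id followed by one float32 value
with open('test_model.bin', 'rb') as f:  # hypothetical path to the packed file
    while chunk := f.read(record_size):
        rec_id, value = struct.unpack('if', chunk)
        print(rec_id, round(value, 2))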
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_upload_file_folder(self):
"""
Test uploading file/folder to the model/dataset repository.
"""
commit_info_upload_file_model = self.api.upload_file(
path_or_fileobj=self.model_file_path,
path_in_repo=os.path.basename(self.model_file_path),
repo_id=self.repo_id_model,
repo_type=REPO_TYPE_MODEL,
commit_message='Add model file for CI_TEST',
)
self.assertTrue(commit_info_upload_file_model is not None)
commit_info_upload_file_dataset = self.api.upload_file(
path_or_fileobj=self.dataset_file_path,
path_in_repo=os.path.basename(self.dataset_file_path),
repo_id=self.repo_id_dataset,
repo_type=REPO_TYPE_DATASET,
commit_message='Add dataset file for CI_TEST',
)
self.assertTrue(commit_info_upload_file_dataset is not None)
commit_info_upload_folder_model = self.api.upload_folder(
repo_id=self.repo_id_model,
folder_path=self.work_dir,
path_in_repo='test_data',
repo_type=REPO_TYPE_MODEL,
commit_message='Add model folder for CI_TEST',
)
self.assertTrue(commit_info_upload_folder_model is not None)
commit_info_upload_folder_dataset = self.api.upload_folder(
repo_id=self.repo_id_dataset,
folder_path=self.work_dir,
path_in_repo='test_data',
repo_type=REPO_TYPE_DATASET,
commit_message='Add dataset folder for CI_TEST',
)
self.assertTrue(commit_info_upload_folder_dataset is not None)
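As a hedged follow-up sketch (not part of this diff), the uploaded model file could be verified by re-downloading the repo; the snapshot_download usage and the repo id below are assumptions, not something this commit adds:
import os
from modelscope import snapshot_download

# Hypothetical repo id; the test above generates ids with a uuid suffix.
local_dir = snapshot_download('my_org/test_upload_file_folder_model_abc123')
assert os.path.exists(os.path.join(local_dir, 'test_model.bin'))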

View File

@@ -1,137 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest
import zipfile
from modelscope.msdatasets import MsDataset
from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects
from modelscope.utils import logger as logging
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode,
ModelFile)
from modelscope.utils.test_utils import test_level
logger = logging.get_logger()
KEY_EXTRACTED = 'extracted'
class DatasetUploadTest(unittest.TestCase):
def setUp(self):
self.old_dir = os.getcwd()
self.dataset_name = 'small_coco_for_test'
self.dataset_file_name = self.dataset_name
self.prepared_dataset_name = 'pets_small'
self.token = os.getenv('TEST_UPLOAD_MS_TOKEN')
error_msg = 'The modelscope token can not be empty, please set env variable: TEST_UPLOAD_MS_TOKEN'
self.assertIsNotNone(self.token, msg=error_msg)
from modelscope.hub.api import HubApi
from modelscope.hub.api import ModelScopeConfig
self.api = HubApi()
self.api.login(self.token)
# get user info
self.namespace, _ = ModelScopeConfig.get_user_info()
self.temp_dir = tempfile.mkdtemp()
self.test_work_dir = os.path.join(self.temp_dir, self.dataset_name)
self.test_meta_dir = os.path.join(self.test_work_dir, 'meta')
if not os.path.exists(self.test_work_dir):
os.makedirs(self.test_work_dir)
def tearDown(self):
os.chdir(self.old_dir)
shutil.rmtree(self.temp_dir, ignore_errors=True)
logger.info(
f'Temporary directory {self.temp_dir} successfully removed!')
@staticmethod
def get_raw_downloaded_file_path(extracted_path):
raw_downloaded_file_path = ''
raw_data_dir = os.path.abspath(
os.path.join(extracted_path, '../../..'))
for root, dirs, files in os.walk(raw_data_dir):
if KEY_EXTRACTED in dirs:
for file in files:
curr_file_path = os.path.join(root, file)
if zipfile.is_zipfile(curr_file_path):
raw_downloaded_file_path = curr_file_path
return raw_downloaded_file_path
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ds_upload(self):
# Get the prepared data from hub, using default modelscope namespace
ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
config_res = ms_ds_train._hf_ds.config_kwargs
extracted_path = config_res.get('split_config').get('train')
raw_zipfile_path = self.get_raw_downloaded_file_path(extracted_path)
MsDataset.upload(
object_name=self.dataset_file_name + '.zip',
local_file_path=raw_zipfile_path,
dataset_name=self.dataset_name,
namespace=self.namespace)
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ds_upload_dir(self):
ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
config_train = ms_ds_train._hf_ds.config_kwargs
extracted_path_train = config_train.get('split_config').get('train')
MsDataset.upload(
object_name='train',
local_file_path=os.path.join(extracted_path_train,
'Pets/images/train'),
dataset_name=self.dataset_name,
namespace=self.namespace)
MsDataset.upload(
object_name='val',
local_file_path=os.path.join(extracted_path_train,
'Pets/images/val'),
dataset_name=self.dataset_name,
namespace=self.namespace)
objects = list_dataset_objects(
hub_api=self.api,
max_limit=-1,
is_recursive=True,
dataset_name=self.dataset_name,
namespace=self.namespace,
version=DEFAULT_DATASET_REVISION)
logger.info(f'{len(objects)} objects have been uploaded: {objects}')
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ds_download_dir(self):
test_ds = MsDataset.load(
self.dataset_name,
namespace=self.namespace,
download_mode=DownloadMode.FORCE_REDOWNLOAD)
assert test_ds.config_kwargs['split_config'].values()
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ds_clone_meta(self):
MsDataset.clone_meta(
dataset_work_dir=self.test_meta_dir,
dataset_id=os.path.join(self.namespace, self.dataset_name))
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ds_upload_meta(self):
# Clone dataset meta repo first.
MsDataset.clone_meta(
dataset_work_dir=self.test_meta_dir,
dataset_id=os.path.join(self.namespace, self.dataset_name))
with open(os.path.join(self.test_meta_dir, ModelFile.README),
'a') as f:
f.write('\nThis is a line for unit test.')
MsDataset.upload_meta(
dataset_work_dir=self.test_meta_dir,
commit_message='Update for unit test.')
if __name__ == '__main__':
unittest.main()

View File

@@ -1,5 +1,5 @@
# isolate cases in separate envs, so we can install different dependencies in each env.
isolated: # test cases that may require excessive anmount of GPU memory or run long time, which will be executed in dedicagted process.
isolated: # test cases that may require excessive amount of GPU memory or run long time, which will be executed in dedicated process.
- test_text_to_speech.py
- test_multi_modal_embedding.py
- test_ofa_tasks.py