add release 1.6.1

xingjun.wang
2023-06-09 20:43:37 +08:00
parent 48c0d2a9af
commit 1c732da386
23 changed files with 345 additions and 63 deletions

View File

@@ -2,11 +2,11 @@ PYTHONPATH=. torchrun examples/pytorch/stable_diffusion/finetune_stable_diffusio
--model 'damo/multi-modal_efficient-diffusion-tuning-lora' \
--work_dir './tmp/stable_diffusion_tuning' \
--train_dataset_namespace 'damo' \
--train_dataset_name 'controlnet_dataset_condition_fill50k' \
--max_epochs 1 \
--train_dataset_name 'buptwq/lora-stable-diffusion-finetune-dog' \
--max_epochs 150 \
--save_ckpt_strategy 'by_epoch' \
--logging_interval 100 \
--train.dataloader.workers_per_gpu 0 \
--evaluation.dataloader.workers_per_gpu 0 \
--train.optimizer.lr 1e-5 \
--train.optimizer.lr 1e-4 \
--use_model_config true

View File

@@ -1,6 +1,7 @@
from dataclasses import dataclass, field
from modelscope import EpochBasedTrainer, MsDataset, TrainingArgs
from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
                        build_dataset_from_file)
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
@@ -94,14 +95,25 @@ def cfg_modify_fn(cfg):
    return cfg


dataset = MsDataset.load(args.train_dataset_name)
train_dataset = dataset['train']
eval_dataset = dataset['validation' if 'validation' in dataset else 'test']
if args.dataset_json_file is None:
    train_dataset = MsDataset.load(
        args.train_dataset_name,
        subset_name=args.train_subset_name,
        split=args.train_split,
        namespace=args.train_dataset_namespace)
    validation_dataset = MsDataset.load(
        args.val_dataset_name,
        subset_name=args.val_subset_name,
        split=args.val_split,
        namespace=args.val_dataset_namespace)
else:
    train_dataset, validation_dataset = build_dataset_from_file(
        args.dataset_json_file)

kwargs = dict(
    model=args.model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    eval_dataset=validation_dataset,
    seed=args.seed,
    work_dir=args.work_dir,
    cfg_modify_fn=cfg_modify_fn)
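
The new --dataset_json_file branch hands dataset assembly over to build_dataset_from_file. A minimal sketch of such a JSON file, assuming the layout shown in the training_args.py docstring and test_cli changes later in this commit (dataset id, subset and column names are placeholders):

# Hedged sketch: generate a dataset description file for --dataset_json_file.
import json

content = [{
    'dataset': {
        'dataset_name': 'clue',      # placeholder: any dataset loadable by MsDataset.load
        'subset_name': 'afqmc',      # placeholder subset
        'split': 'train',
    },
    'column_mapping': {              # original column name -> new name (passed to rename_columns)
        'sentence1': 'sentence1',
        'sentence2': 'sentence2',
        'label': 'label',
    },
    'usage': 0.8,                    # float = train/eval split ratio, or 'train' / 'val'
}]
with open('./dataset.json', 'w') as f:
    json.dump(content, f)
# The script is then launched with --dataset_json_file ./dataset.json
# instead of the --train_dataset_name / --val_dataset_name arguments.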

View File

@@ -9,6 +9,9 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
--work_dir './tmp' \
--model 'damo/nlp_gpt3_text-generation_1.3B' \
--train_dataset_name 'chinese-poetry-collection' \
--val_dataset_name 'chinese-poetry-collection' \
--train_split 'train' \
--val_split 'test' \
--preprocessor 'text-gen-jieba-tokenizer' \
--src_txt 'text1' \
--tgt_txt 'text2' \

View File

@@ -4,6 +4,9 @@ PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.
--task 'text2text-generation' \
--model 'damo/nlp_mt5_zero-shot-augment_chinese-base' \
--train_dataset_name 'DuReader_robust-QG' \
--val_dataset_name 'DuReader_robust-QG' \
--train_split 'train' \
--val_split 'validation' \
--src_txt 'text1' \
--tgt_txt 'text2' \
--max_epochs 1 \

View File

@@ -3,6 +3,9 @@ PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.
--work_dir './tmp' \
--model 'damo/nlp_palm2.0_pretrained_chinese-base' \
--train_dataset_name 'DuReader_robust-QG' \
--val_dataset_name 'DuReader_robust-QG' \
--train_split 'train' \
--val_split 'validation' \
--src_txt 'text1' \
--tgt_txt 'text2' \
--max_epochs 1 \

View File

@@ -13,6 +13,9 @@ from modelscope.metainfo import Models
from modelscope.utils.constant import ModelFile, Tasks
@EXPORTERS.register_module(
    Tasks.domain_specific_object_detection,
    module_name=Models.tinynas_damoyolo)
@EXPORTERS.register_module(
    Tasks.image_object_detection, module_name=Models.tinynas_damoyolo)
class ObjectDetectionDamoyoloExporter(TorchModelExporter):

View File

@@ -16,6 +16,7 @@ from http.cookiejar import CookieJar
from os.path import expanduser
from typing import Dict, List, Optional, Tuple, Union
import pandas as pd
import requests
from requests import Session
from requests.adapters import HTTPAdapter, Retry
@@ -651,6 +652,41 @@ class HubApi:
        return text_headers, text_content

    @staticmethod
    def fetch_csv_from_url(url, out_path, chunk_size=1000000):
        from io import StringIO
        import hashlib
        out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest())
        if os.path.exists(out_path):
            return out_path
        cookies = ModelScopeConfig.get_cookies()
        # Make the request and get the response content as TextIO
        response = requests.get(url, cookies=cookies)
        data = StringIO(response.text)
        # Use read_csv with the TextIO object
        csv_file_reader = pd.read_csv(data, iterator=True, dtype=str, delimiter=None)
        loop = True
        iter_num = 0
        while loop:
            try:
                chunk = csv_file_reader.get_chunk(size=chunk_size)
                if iter_num == 0:
                    with_header = True
                else:
                    with_header = False
                chunk.to_csv(out_path, mode='a', index=False, header=with_header)
                iter_num += 1
            except StopIteration:
                loop = False
                logger.info('stop chunk iteration')
        return out_path

    def get_dataset_file_url(
            self,
            file_name: str,

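A hedged usage sketch of the new helper: it downloads a remote CSV in chunks, caches it under an md5-named file inside out_path, and returns that local path (URL and cache directory below are hypothetical):

import os
from modelscope.hub.api import HubApi

cache_dir = '/tmp/ms_csv_cache'                 # hypothetical cache directory
os.makedirs(cache_dir, exist_ok=True)
local_csv = HubApi.fetch_csv_from_url(
    url='https://example.com/meta/train.csv',   # hypothetical metadata CSV url
    out_path=cache_dir)
print(local_csv)  # <cache_dir>/<md5 of url>; reused by later calls with the same url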
View File

@@ -2,6 +2,8 @@
import concurrent.futures
import os
import shutil
from multiprocessing import Manager, Process, Value
from modelscope.hub.api import HubApi
from modelscope.hub.constants import ModelVisibility
@@ -11,6 +13,10 @@ from modelscope.utils.logger import get_logger
logger = get_logger()
_executor = concurrent.futures.ProcessPoolExecutor(max_workers=8)
_queues = dict()
_flags = dict()
_tasks = dict()
_manager = None
def _api_push_to_hub(repo_name,
@@ -131,3 +137,64 @@ def push_to_hub_async(repo_name,
    return _executor.submit(_api_push_to_hub, repo_name, output_dir, token,
                            private, commit_message, tag, source_repo,
                            ignore_file_pattern, revision)


def submit_task(q, b):
    while True:
        b.value = False
        item = q.get()
        logger.info(item)
        b.value = True
        if not item.pop('done', False):
            delete_dir = item.pop('delete_dir', False)
            output_dir = item.get('output_dir')
            try:
                push_to_hub(**item)
                if delete_dir and os.path.exists(output_dir):
                    shutil.rmtree(output_dir)
            except Exception as e:
                logger.error(e)
        else:
            break


class UploadStrategy:
    cancel = 'cancel'
    wait = 'wait'


def push_to_hub_in_queue(queue_name, strategy=UploadStrategy.cancel, **kwargs):
    assert queue_name is not None and len(
        queue_name) > 0, 'Please specify a valid queue name!'
    global _manager
    if _manager is None:
        _manager = Manager()
    if queue_name not in _queues:
        _queues[queue_name] = _manager.Queue()
        _flags[queue_name] = Value('b', False)
        process = Process(
            target=submit_task, args=(_queues[queue_name], _flags[queue_name]))
        process.start()
        _tasks[queue_name] = process
    queue = _queues[queue_name]
    flag: Value = _flags[queue_name]
    if kwargs.get('done', False):
        queue.put(kwargs)
    elif flag.value and strategy == UploadStrategy.cancel:
        logger.error(
            f'Another uploading is running, '
            f'this uploading with message {kwargs.get("commit_message")} will be canceled.'
        )
    else:
        queue.put(kwargs)


def wait_for_done(queue_name):
    process: Process = _tasks.pop(queue_name, None)
    if process is None:
        return
    process.join()
    _queues.pop(queue_name)
    _flags.pop(queue_name)
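
A minimal usage sketch of the queue-based upload API added here; queue name, repo id and output directory are placeholders, and the token may instead come from the MODELSCOPE_API_TOKEN environment variable:

from modelscope.hub.push_to_hub import (UploadStrategy, push_to_hub_in_queue,
                                        wait_for_done)

push_to_hub_in_queue(
    'my_upload_queue',
    strategy=UploadStrategy.wait,
    repo_name='my_namespace/my_model',  # placeholder repo id
    output_dir='./tmp/output',          # placeholder local checkpoint dir
    commit_message='epoch_1')
# Signal that no more uploads are coming, then block until the worker drains the queue.
push_to_hub_in_queue('my_upload_queue', done=True)
wait_for_done('my_upload_queue')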

View File

@@ -346,6 +346,7 @@ def merge_outputs(detections):
def filter(results, logi, ps):
    # this function select boxes
    batch_size, feat_dim = logi.shape[0], logi.shape[2]
    num_valid = sum(results[1][:, 8] >= 0.15)
    slct_logi = np.zeros((batch_size, num_valid, feat_dim), dtype=np.float32)
@@ -358,6 +359,14 @@ def filter(results, logi, ps):
    return torch.Tensor(slct_logi).cuda(), torch.Tensor(slct_dets).cuda()


def normalized_ps(ps, vocab_size):
    ps = torch.round(ps).to(torch.int64)
    ps = torch.where(ps < vocab_size, ps, (vocab_size - 1)
                     * torch.ones(ps.shape).to(torch.int64).cuda())
    ps = torch.where(ps >= 0, ps, torch.zeros(ps.shape).to(torch.int64).cuda())
    return ps


def process_detect_output(output, meta):
    K, MK = 3000, 5000
    num_classes = 2
@@ -390,6 +399,7 @@ def process_detect_output(output, meta):
    logi = logi + cr
    results = merge_outputs(detections)
    slct_logi_feat, slct_dets_feat = filter(results, logi, raw_dets[:, :, :8])
    slct_dets_feat = normalized_ps(slct_dets_feat, 256)
    slct_output_dets = results[1][:slct_logi_feat.shape[1], :8]
    return slct_logi_feat, slct_dets_feat, slct_output_dets
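
The new normalized_ps step clamps the rounded point coordinates into the valid index range [0, vocab_size - 1]. A CPU-only illustration of the same clamping logic (the original code operates on CUDA tensors):

import torch

ps = torch.tensor([[-3.2, 10.7, 300.4]])
vocab_size = 256
ps = torch.round(ps).to(torch.int64)
# torch.clamp is equivalent to the pair of torch.where calls above.
ps = torch.clamp(ps, min=0, max=vocab_size - 1)
print(ps)  # tensor([[  0,  11, 255]])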

View File

@@ -86,7 +86,6 @@ class OssDownloader(BaseDownloader):
    def _authorize(self) -> None:
        """ Authorization of target dataset.
        Get credentials from cache and send to the modelscope-hub in the future. """
        # TODO: obtain credentials from loacl cache when available.
        cookies = ModelScopeConfig.get_cookies()
        git_token = ModelScopeConfig.get_token()
        user_info = ModelScopeConfig.get_user_info()

View File

@@ -127,15 +127,15 @@ class RemoteDataLoaderManager(DataLoaderManager):
            return dataset_ret
        # To use the modelscope data loader
        elif data_loader_type == RemoteDataLoaderType.MS_DATA_LOADER:
            oss_data_loader = OssDownloader(
            oss_downloader = OssDownloader(
                dataset_context_config=self.dataset_context_config)
            oss_data_loader.process()
            oss_downloader.process()
            # download statistics
            self.api.dataset_download_statistics(
                dataset_name=dataset_name,
                namespace=namespace,
                use_streaming=use_streaming)
            return oss_data_loader.dataset
            return oss_downloader.dataset
        else:
            raise f'Expected remote data loader type: {RemoteDataLoaderType.HF_DATA_LOADER.value}/' \
                f'{RemoteDataLoaderType.MS_DATA_LOADER.value}, but got {data_loader_type} .'

View File

@@ -6,8 +6,9 @@ from typing import Dict, Union
import datasets
import pandas as pd
import pyarrow as pa
from datasets import (ArrowBasedBuilder, GeneratorBasedBuilder,
                      IterableDataset, IterableDatasetDict)
from datasets import (ArrowBasedBuilder, Dataset, DatasetDict,
                      GeneratorBasedBuilder, IterableDataset,
                      IterableDatasetDict)
from datasets.filesystems import is_remote_filesystem
from datasets.info import DatasetInfo
from datasets.naming import camelcase_to_snakecase
@@ -47,6 +48,7 @@ class CsvDatasetBuilder(csv.Csv):
        self.meta_data_files = dataset_context_config.data_meta_config.meta_data_files
        self.zip_data_files = dataset_context_config.data_meta_config.zip_data_files
        self.input_config_kwargs = dataset_context_config.config_kwargs
        self.split_path_dict = dict({})

        self.cache_build_dir = os.path.join(self.cache_root_dir,
                                            self.namespace, self.dataset_name,
@@ -61,16 +63,25 @@ class CsvDatasetBuilder(csv.Csv):
        sub_dir_hash = get_subdir_hash_from_split(
            split=split, version=self.version)

        from datasets.data_files import DataFilesDict, DataFilesList
        data_files = {
            k: DataFilesList([v], origin_metadata=None)
            for k, v in self.meta_data_files.items()
        }
        data_files = DataFilesDict.from_local_or_remote(data_files)

        super().__init__(
            cache_dir=self.cache_build_dir,
            config_name=self.namespace,
            hash=sub_dir_hash,
            data_files=self.meta_data_files,
            data_files=data_files,
            **self.input_config_kwargs)
        self.info.builder_name = self.dataset_name
        self.name = camelcase_to_snakecase(self.dataset_name)
        self.local_meta_csv_paths: dict = dict({})

    def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE):
        builder_data_dir = os.path.join(
            self._cache_dir_root,
@@ -147,6 +158,87 @@ class CsvDatasetBuilder(csv.Csv):
                    f"Failed to read file '{file}' with error {type(e)}: {e}")
                raise

    def download_and_prepare(self, download_mode, dl_manager,
                             **download_kwargs):
        target_cache_dir = dl_manager.download_config.cache_dir
        split_name = dl_manager.download_config.split
        if not split_name:
            split_name = DatasetPathName.LOCK_FILE_NAME_ANY
        version_name = dl_manager.download_config.version
        if not version_name:
            version_name = DatasetPathName.LOCK_FILE_NAME_ANY
        subset_name = self.subset_name
        if not subset_name:
            subset_name = DatasetPathName.LOCK_FILE_NAME_ANY

        # Prevent parallel disk operations
        lock_file_names = []
        lock_file_names.append(DatasetPathName.DATA_FILES_NAME)
        lock_file_names.append(dl_manager.download_config.dataset_name)
        lock_file_names.append(version_name)
        lock_file_names.append(subset_name)
        lock_file_names.append(split_name)
        lock_file_name = DatasetPathName.LOCK_FILE_NAME_DELIMITER.join(
            lock_file_names)
        lock_path = os.path.join(
            target_cache_dir.strip(DatasetPathName.DATA_FILES_NAME),
            lock_file_name + '.lock')

        with FileLock(lock_path):
            data_exists = os.path.exists(target_cache_dir)
            if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS.value:
                logger.warning(
                    f'Reusing dataset {self.name} ({target_cache_dir})')
            logger.info(f'Generating dataset {self.name} ({target_cache_dir})')
            self._download_and_prepare(
                dl_manager=dl_manager, download_mode=download_mode)

    def _download_and_prepare(self, dl_manager, download_mode):
        import shutil
        target_cache_dir = dl_manager.download_config.cache_dir
        if download_mode == DownloadMode.FORCE_REDOWNLOAD.value:
            shutil.rmtree(target_cache_dir, ignore_errors=True)
        os.makedirs(target_cache_dir, exist_ok=True)
        self.local_meta_csv_paths = {
            k: HubApi.fetch_csv_from_url(v, target_cache_dir)
            for k, v in self.meta_data_files.items()
        }
        self.split_path_dict = dl_manager.download_and_extract(
            self.zip_data_files)

    def _convert_csv_to_dataset(self, split_name, csv_file_path):
        df = pd.read_csv(
            csv_file_path, iterator=False, delimiter=self.csv_delimiter)
        transform_fields = []
        for field_name in df.columns.tolist():
            if field_name.endswith(':FILE'):
                transform_fields.append(field_name)
        base_extracted_dir = self.split_path_dict.get(split_name, '')
        for field_name in transform_fields:
            if base_extracted_dir:
                df[field_name] = df[field_name].apply(
                    lambda x: os.path.join(base_extracted_dir, x))
        pa_data = pa.Table.from_pandas(df)
        return Dataset(arrow_table=pa_data)

    def as_dataset(self) -> DatasetDict:
        return DatasetDict({
            k: self._convert_csv_to_dataset(k, v)
            for k, v in self.local_meta_csv_paths.items()
        })


class TaskSpecificDatasetBuilder(CsvDatasetBuilder):
@@ -181,7 +273,7 @@ class TaskSpecificDatasetBuilder(CsvDatasetBuilder):
            self._cache_dir.replace(os.sep, '_') + '.lock')
        with FileLock(lock_path):
            data_exists = os.path.exists(self._cache_dir)
            if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
            if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:  # TODO: .value??
                logger.warning(
                    f'Reusing dataset {self.name} ({self._cache_dir})')
                return

View File

@@ -170,6 +170,7 @@ class MsDataset:
            cache_dir: Optional[str] = MS_DATASETS_CACHE,
            use_streaming: Optional[bool] = False,
            custom_cfg: Optional[Config] = Config(),
            token: Optional[str] = None,
            **config_kwargs,
    ) -> Union[dict, 'MsDataset', NativeIterableDataset]:
        """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
@@ -197,12 +198,18 @@ class MsDataset:
                NativeIterableDataset or a dict of NativeIterableDataset.
            custom_cfg (str, Optional): Model configuration, this can be used for custom datasets.
                see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
            token (str, Optional): SDK token of ModelScope.
            **config_kwargs (additional keyword arguments): Keyword arguments to be passed

        Returns:
            MsDataset (MsDataset): MsDataset object for a certain dataset.
        """
        if token:
            from modelscope.hub.api import HubApi
            api = HubApi()
            api.login(token)

        download_mode = DownloadMode(download_mode
                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
        hub = Hubs(hub or Hubs.modelscope)
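
A hedged usage sketch of the new token argument: when given, MsDataset.load calls HubApi().login(token) before resolving the dataset (token value and dataset id are placeholders):

from modelscope import MsDataset

ds = MsDataset.load(
    'chinese-poetry-collection',        # placeholder dataset id
    split='train',
    token='YOUR_MODELSCOPE_SDK_TOKEN')  # placeholder SDK token; triggers HubApi().login(token)
print(ds)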

View File

@@ -388,10 +388,14 @@ class TokenClassificationTransformersPreprocessor(
                f'tokenizer {tokenizer_name}, please use a fast tokenizer instead, or '
                f'try to implement a `{method}` method')
        label_mask, offset_mapping = getattr(self, method)(tokens)
        padding = self.nlp_tokenizer.get_tokenizer_kwarg('padding')
        max_length = self.nlp_tokenizer.get_tokenizer_kwarg('max_length')
        special_token = 1 if self.nlp_tokenizer.get_tokenizer_kwarg(
            'add_special_tokens') else 0
        padding = kwargs.get('padding',
                             self.nlp_tokenizer.get_tokenizer_kwarg('padding'))
        max_length = kwargs.get(
            'max_length', self.nlp_tokenizer.get_tokenizer_kwarg('max_length'))
        special_token = 1 if kwargs.get(
            'add_special_tokens',
            self.nlp_tokenizer.get_tokenizer_kwarg(
                'add_special_tokens')) else 0
        if len(label_mask) > max_length - 2 * special_token:
            label_mask = label_mask[:(max_length - 2 * special_token)]
            offset_mapping = offset_mapping[:sum(label_mask)]
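
With this change, call-time keyword arguments take precedence over the tokenizer defaults for padding, max_length and add_special_tokens. A hedged sketch of the intended call pattern; the model id is a placeholder and whether kwargs are forwarded this way for a given preprocessor is an assumption:

from modelscope.preprocessors import Preprocessor

preprocessor = Preprocessor.from_pretrained(
    'damo/nlp_raner_named-entity-recognition_chinese-base-news')  # placeholder token-classification model
# max_length/padding passed at call time are expected to override the values
# stored in the preprocessor's tokenizer kwargs.
output = preprocessor('今天天气不错', max_length=32, padding='max_length')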

View File

@@ -57,7 +57,7 @@ def update_cfg(cfg: Config) -> Config:
                key_chain_map[_HOOK_KEY_CHAIN_MAP[key]] = value
                hook.clear()
    cfg.train.hooks = list(filter(bool, cfg.train.hooks))
    cfg.merge_from_dict(key_chain_map)
    cfg.merge_from_dict(key_chain_map, force=False)
    return cfg

View File

@@ -1,14 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import random
import time
import shutil
from typing import Optional
import json
import numpy as np
import torch
from modelscope.hub.check_model import check_model_is_id
from modelscope.hub.push_to_hub import push_to_hub_async
from modelscope.hub.push_to_hub import (UploadStrategy, push_to_hub_in_queue,
                                        wait_for_done)
from modelscope.metainfo import Hooks
from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.checkpoint.checkpoint_processor import \
@@ -45,7 +47,9 @@ class CheckpointHook(Hook):
        hub_repo_id (str): The hub repo id.
        hub_token (str): The token of the modelhub. You can also set the environment variable `MODELSCOPE_API_TOKEN`.
        private_hub (bool): Whether push to a private hub, default True.
        hub_revision (str): Which branch to push the model to, default is `master`
        hub_revision (str): Which branch to push the model to, default is `master`.
        upload_strategy (str): The action adopted when the previous uploading is not done
            and the next one is coming, can be `cancel` or `wait`.
        kwargs:
            by_epoch (bool): Same with `save_strategy`, but has a higher priority, legacy argument.
            output_sub_dir (str): The folder under the `save_dir` to save the output checkpoint for inference.
@@ -56,6 +60,8 @@ class CheckpointHook(Hook):
    EVAL_RESULT_FILE = 'eval_result.txt'
    PUSH_TO_HUB_QUEUE_NAME = 'train.checkpoint'

    def __init__(self,
                 save_strategy: Optional[str] = CheckpointStrategy.by_epoch,
                 interval: Optional[int] = 0,
@@ -68,6 +74,7 @@ class CheckpointHook(Hook):
                 hub_token: Optional[str] = None,
                 private_hub: Optional[bool] = True,
                 hub_revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
                 upload_strategy: Optional[str] = UploadStrategy.cancel,
                 **kwargs):
        self.interval = interval
        self.save_dir = save_dir
@@ -89,9 +96,9 @@ class CheckpointHook(Hook):
        self.hub_token = hub_token
        self.private_hub = private_hub
        self.hub_revision = hub_revision
        self.upload_strategy = upload_strategy
        self.tag = -1
        self.is_model_id = None
        self.push_to_hub_future = None
        self.max_checkpoint_num = None
        if max_checkpoint_num is not None:
            self.max_checkpoint_num = max(int(max_checkpoint_num), 1)
@@ -149,13 +156,15 @@ class CheckpointHook(Hook):
                f'Saving checkpoint at {trainer.iter + 1} iter')
            self._save_checkpoint(trainer, prefix)
        if is_master() and self.push_to_hub:
            if self.push_to_hub_future is not None and not self.push_to_hub_future.done(
            ):
                self.logger.error(
                    f'Another uploading is running, '
                    f'this uploading with message {prefix} will be canceled.')
                return
            self.push_to_hub_future = self._push_to_hub(trainer, prefix)
            if self.upload_strategy == UploadStrategy.cancel:
                output_dir = self.output_dir
                delete_dir = False
            else:
                output_dir = self.output_dir + '_upload_' + prefix
                shutil.copytree(
                    self.output_dir, output_dir, dirs_exist_ok=True)
                delete_dir = True
            self._push_to_hub(trainer, prefix, output_dir, delete_dir)

    def after_train_epoch(self, trainer):
        if self.save_strategy != CheckpointStrategy.by_epoch:
@@ -172,32 +181,36 @@ class CheckpointHook(Hook):
            self._do_save(trainer, CheckpointStrategy.by_step)

    def after_run(self, trainer):
        if self.push_to_hub_future is not None and not self.push_to_hub_future.done(
        ):
            self.logger.info('Train finished. Uploading models, waiting...')
            while not self.push_to_hub_future.done():
                time.sleep(1)
            self.logger.info('Uploading models done.')
        self.logger.info('Train finished. Uploading models, waiting...')
        push_to_hub_in_queue(
            self.PUSH_TO_HUB_QUEUE_NAME,
            strategy=self.upload_strategy,
            done=True)
        wait_for_done(self.PUSH_TO_HUB_QUEUE_NAME)
        self.logger.info('Uploading models done.')

    def _push_to_hub(self, trainer, prefix):
    def _push_to_hub(self, trainer, prefix, output_dir, delete_dir=False):
        if self.is_model_id is None:
            self.is_model_id = check_model_is_id(trainer.input_model_id,
                                                 self.hub_token)
        self.tag += 1
        return push_to_hub_async(
            self.hub_repo_id,
            self.output_dir,
        return push_to_hub_in_queue(
            self.PUSH_TO_HUB_QUEUE_NAME,
            strategy=self.upload_strategy,
            repo_name=self.hub_repo_id,
            output_dir=output_dir,
            token=self.hub_token,
            private=self.private_hub,
            commit_message=prefix,
            tag=f'v1.{self.tag}',
            revision=self.hub_revision,
            source_repo=trainer.input_model_id if self.is_model_id else '')
            source_repo=trainer.input_model_id if self.is_model_id else '',
            delete_dir=delete_dir)

    def save_evaluate_results(self, trainer):
        with open(os.path.join(self.output_dir, self.EVAL_RESULT_FILE),
                  'w') as f:
            f.write(str(trainer.metric_values))
            f.write(json.dumps(trainer.metric_values))

    def _save_checkpoint(self, trainer, prefix):
        """Save checkpoint files and remove obsolete ones

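A hedged configuration sketch of where the new upload_strategy option could be set, assuming the hooks-list configuration style (adjust the existing CheckpointHook entry instead if the config already has one; the repo id is a placeholder and the token can come from MODELSCOPE_API_TOKEN):

def cfg_modify_fn(cfg):
    # Add a checkpoint hook that pushes checkpoints through the new upload queue.
    cfg.train.hooks.append({
        'type': 'CheckpointHook',
        'interval': 1,
        'push_to_hub': True,
        'hub_repo_id': 'my_namespace/my_model',  # placeholder repo id
        'private_hub': True,
        'upload_strategy': 'wait',               # 'cancel' (default) skips overlapping uploads
    })
    return cfg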
View File

@@ -9,6 +9,7 @@ import json
from modelscope.trainers.cli_argument_parser import CliArgumentParser
from modelscope.utils.config import Config
from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE
def set_flatten_value(values: Union[str, List[str]]):
@@ -62,13 +63,13 @@ class DatasetArgs:
        })

    train_dataset_namespace: str = field(
        default=None,
        default=DEFAULT_DATASET_NAMESPACE,
        metadata={
            'help': 'The dataset namespace used for training',
        })

    val_dataset_namespace: str = field(
        default=None,
        default=DEFAULT_DATASET_NAMESPACE,
        metadata={
            'help': 'The dataset namespace used for evaluating',
        })
@@ -450,7 +451,7 @@ class TrainingArgs(DatasetArgs, TrainArgs, ModelArgs):
                _unknown[unknown[i].replace('-', '')] = parse_value(unknown[i + 1])

        args_dict = vars(args)
        self.manual_args += parser.manual_args
        self._unknown_args.update(_unknown)
        for key, value in deepcopy(args_dict).items():
            if key is not None and hasattr(self, key):
                setattr(self, key, value)
@@ -506,7 +507,7 @@ def build_dataset_from_file(filename):
                "text2": "sequence2",
                "label": "label",
            }
            "split": 0.8,
            "usage": 0.8,
        }
    ]
    """
@@ -540,16 +541,16 @@ def build_dataset_from_file(filename):
            lambda x: x,
            remove_columns=remove_columns,
            features=new_features).rename_columns(ds['column_mapping'])
        split = ds['split']
        if isinstance(split, str):
            assert split in ('train', 'val')
            if split == 'train':
        usage = ds['usage']
        if isinstance(usage, str):
            assert usage in ('train', 'val')
            if usage == 'train':
                train_set.append(dataset)
            else:
                eval_set.append(dataset)
        else:
            assert isinstance(split, float) and 0 < split < 1
            ds_dict = dataset.train_test_split(train_size=split)
            assert isinstance(usage, float) and 0 < usage < 1
            ds_dict = dataset.train_test_split(train_size=usage)
            train_set.append(ds_dict['train'])
            eval_set.append(ds_dict['test'])
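
Renaming the key from split to usage keeps the same semantics: a float is a train/eval split ratio, a string assigns the dataset to one side. A short hedged sketch, reusing the dataset.json layout shown in the tests below:

from modelscope import build_dataset_from_file

# Each dataset.json entry carries 'usage': 0.8 (80/20 split) or 'usage': 'train' / 'val'.
train_dataset, eval_dataset = build_dataset_from_file('./dataset.json')
print(train_dataset, eval_dataset)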

File diff suppressed because one or more lines are too long

View File

@@ -171,7 +171,21 @@ def import_plugins(plugins: List[str] = None) -> List[str]:
    for module_name in plugins:
        try:
            import_module_and_submodules(module_name)
            # TODO: include and exclude should be configurable, hard code now
            import_module_and_submodules(
                module_name,
                include={
                    'easycv.toolkit.modelscope',
                    'easycv.hooks',
                    'easycv.models',
                    'easycv.core',
                    'easycv.toolkit',
                    'easycv.predictors',
                },
                exclude={
                    'easycv.toolkit.*',
                    'easycv.*',
                })
            logger.info('Plugin %s available', module_name)
            imported_plugins.append(module_name)
        except ModuleNotFoundError as e:
@@ -238,9 +252,10 @@ def import_module_and_submodules(package_name: str,
        path_string = '' if not path else path[0]

        # walk_packages only finds immediate children, so need to recurse.
        for module_finder, name, _ in pkgutil.walk_packages(path):
        for module_finder, name, _ in pkgutil.iter_modules(path):
            # Sometimes when you import third-party libraries that are on your path,
            # `pkgutil.walk_packages` returns those too, so we need to skip them.
            # `pkgutil.iter_modules` avoid import those package
            if path_string and module_finder.path != path_string: # type: ignore[union-attr]
                continue
            if name.startswith('_'):
@@ -250,7 +265,8 @@ def import_module_and_submodules(package_name: str,
                # skip tests
                continue
            subpackage = f'{package_name}.{name}'
            import_module_and_submodules(subpackage, exclude=exclude)
            import_module_and_submodules(
                subpackage, include=include, exclude=exclude)
    except SystemExit as e:
        # this case is specific for easy_cv's tools/predict.py exit
        logger.warning(

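A hedged illustration of why the loop switched from walk_packages to iter_modules: iter_modules only lists a package's immediate children and does not import them, so unrelated packages that happen to be importable from sys.path are not pulled in.

import pkgutil

import modelscope.hub as pkg

# Lists direct submodules of modelscope.hub without importing them.
for module_finder, name, is_pkg in pkgutil.iter_modules(pkg.__path__):
    print(name, is_pkg)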
View File

@@ -1,5 +1,5 @@
# Make sure to modify __release_datetime__ to release time when making official release.
__version__ = '1.6.0'
__version__ = '1.6.1'
# default release datetime for branches under active development is set
# to be a time far-far-away-into-the-future
__release_datetime__ = '2023-05-18 23:59:00'

View File

@@ -27,6 +27,15 @@ class TestExportObjectDetectionDamoyolo(unittest.TestCase):
        Exporter.from_model(model).export_onnx(
            input_shape=(1, 3, 640, 640), output_dir=self.tmp_dir)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_export_domain_specific_object_detection_damoyolo(self):
        model_id = 'damo/cv_tinynas_human-detection_damoyolo'
        model = Model.from_pretrained(model_id)
        with tempfile.TemporaryDirectory() as tmp_dir:
            Exporter.from_model(model).export_onnx(
                input_shape=(1, 3, 640, 640), output_dir=tmp_dir)


if __name__ == '__main__':
    unittest.main()

View File

@@ -241,8 +241,7 @@ class MsDatasetTest(unittest.TestCase):
            use_streaming=True)
        data_example = next(iter(dataset))
        print(data_example)
        assert isinstance(data_example['Noisy Image:FILE:Object'],
                          PngImageFile)
        assert data_example.values()

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_to_ms_dataset(self):

View File

@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest
import json
@@ -21,7 +22,7 @@ class TestCli(unittest.TestCase):
                'sentence2': 'sentence2',
                'label': 'label',
            },
            'split': 0.8,
            'usage': 0.8,
        }, {
            'dataset': {
                'dataset_name': 'glue',
@@ -33,11 +34,15 @@ class TestCli(unittest.TestCase):
                'hypothesis': 'sentence2',
                'label': 'label',
            },
            'split': 'val',
            'usage': 'val',
        }]
        with open('./dataset.json', 'w') as f:
            json.dump(content, f)

    def tearDown(self) -> None:
        if os.path.exists('./dataset.json'):
            os.remove('./dataset.json')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_merge_dataset_from_file(self):
        dataset = MsDataset.load('clue', subset_name='cmnli', split='train')