From 8d1c290acf68ab325787f741beb07e69f5669d8e Mon Sep 17 00:00:00 2001 From: XDUWQ <1300964705@qq.com> Date: Mon, 4 Sep 2023 16:44:25 +0800 Subject: [PATCH 1/3] fix a bug of custom diffusion --- .../multi_modal/custom_diffusion/custom_diffusion_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modelscope/trainers/multi_modal/custom_diffusion/custom_diffusion_trainer.py b/modelscope/trainers/multi_modal/custom_diffusion/custom_diffusion_trainer.py index 1183c167..a18b546e 100644 --- a/modelscope/trainers/multi_modal/custom_diffusion/custom_diffusion_trainer.py +++ b/modelscope/trainers/multi_modal/custom_diffusion/custom_diffusion_trainer.py @@ -40,7 +40,8 @@ class CustomCheckpointProcessor(CheckpointProcessor): def __init__(self, modifier_token, modifier_token_id, - torch_type=torch.float32): + torch_type=torch.float32, + safe_serialization=False): """Checkpoint processor for custom diffusion. Args: From 5c5f05021b4dc6c99b5936ca1bd628b022ad31ed Mon Sep 17 00:00:00 2001 From: Jintao Date: Wed, 6 Sep 2023 18:24:47 +0800 Subject: [PATCH 2/3] add trust_remote_code note (#522) --- modelscope/utils/automodel_utils.py | 2 +- modelscope/utils/hf_util.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/modelscope/utils/automodel_utils.py b/modelscope/utils/automodel_utils.py index 56075618..afd83817 100644 --- a/modelscope/utils/automodel_utils.py +++ b/modelscope/utils/automodel_utils.py @@ -66,7 +66,7 @@ def try_to_load_hf_model(model_dir: str, task_name: str, if use_hf and automodel_class is None: raise ValueError(f'Model import failed. 
You used `use_hf={use_hf}`, ' - 'but the model is not a model of hf') + 'but the model is not a model of hf.') model = None if automodel_class is not None: diff --git a/modelscope/utils/hf_util.py b/modelscope/utils/hf_util.py index 2a534eb6..fd367847 100644 --- a/modelscope/utils/hf_util.py +++ b/modelscope/utils/hf_util.py @@ -112,22 +112,29 @@ def check_hf_code(model_dir: str, auto_class: type, # trust_remote_code is False or has_remote_code is False model_type = config_dict.get('model_type', None) if model_type is None: - raise ValueError(f'`model_type` key is not found in {config_path}') + raise ValueError(f'`model_type` key is not found in {config_path}.') + trust_remote_code_info = '.' + if not trust_remote_code: + trust_remote_code_info = ', You can try passing `trust_remote_code=True`.' if auto_class is AutoConfigHF: if model_type not in CONFIG_MAPPING: - raise ValueError(f'{model_type} not found in HF CONFIG_MAPPING') + raise ValueError( + f'{model_type} not found in HF `CONFIG_MAPPING`{trust_remote_code_info}' + ) elif auto_class is AutoTokenizerHF: if model_type not in TOKENIZER_MAPPING_NAMES: raise ValueError( - f'{model_type} not found in HF TOKENIZER_MAPPING_NAMES') + f'{model_type} not found in HF `TOKENIZER_MAPPING_NAMES`{trust_remote_code_info}' + ) else: mapping_names = [ m.model_type for m in auto_class._model_mapping.keys() ] if model_type not in mapping_names: raise ValueError( - f'{model_type} not found in HF auto_class._model_mapping') + f'{model_type} not found in HF `auto_class._model_mapping`{trust_remote_code_info}' + ) def get_wrapped_class(module_class, ignore_file_pattern=[], **kwargs): From 64d24df4d3dd224bc5cf9ab88e15e35b330c228e Mon Sep 17 00:00:00 2001 From: liuyhwangyh Date: Thu, 7 Sep 2023 22:25:59 +0800 Subject: [PATCH 3/3] add download failed retry (#523) * add download failed retry * fix lint issue --- modelscope/hub/constants.py | 2 +- modelscope/hub/file_download.py | 43 +++++++++++++++++++++------------ 2 files changed, 29 
def download_part_with_retry(params):
    """Download one byte range of a file to its slot in a shared file, with retry.

    Intended to be mapped over a ThreadPoolExecutor by ``parallel_download``;
    each worker fetches one ``bytes=start-end`` range and writes it in place.

    Args:
        params: Tuple of ``(progress, start, end, url, file_name, cookies,
            headers)`` — the shared tqdm progress bar, the inclusive byte
            range to fetch, the download url, the pre-allocated destination
            file path, and the request cookies/headers.

    Raises:
        urllib3.exceptions.MaxRetryError: once all
            ``API_FILE_DOWNLOAD_RETRY_TIMES`` attempts are exhausted
            (raised by ``Retry.increment``).
    """
    # unpack parameters
    progress, start, end, url, file_name, cookies, headers = params
    # Copy so the Range header added below does not leak into the caller's
    # shared headers dict (one dict is reused across all parts).
    get_headers = {} if headers is None else copy.deepcopy(headers)
    get_headers['Range'] = 'bytes=%s-%s' % (start, end)
    retry = Retry(
        total=API_FILE_DOWNLOAD_RETRY_TIMES,
        backoff_factor=1,
        allowed_methods=['GET'])
    while True:
        try:
            with open(file_name, 'rb+') as f:
                # Seek to this part's slot; on retry we rewind and rewrite
                # the whole range, so partially written data is harmless.
                f.seek(start)
                r = requests.get(
                    url,
                    stream=True,
                    headers=get_headers,
                    cookies=cookies,
                    timeout=API_FILE_DOWNLOAD_TIMEOUT)
                for chunk in r.iter_content(
                        chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        # Advance by the bytes actually received; updating
                        # by (end - start) on every chunk over-counts the
                        # shared progress bar by one part-size per chunk.
                        progress.update(len(chunk))
            break
        except Exception as e:  # no matter what exception, we will retry.
            # Retry.increment raises MaxRetryError when retries are spent,
            # which terminates the loop by propagating to the caller.
            retry = retry.increment('GET', url, error=e)
            logger.warning('Download file from: %s to: %s failed, will retry'
                           % (start, end))
            # Honor the exponential backoff before the next attempt.
            retry.sleep()