From a91f19ea5412a0e18bc3f9c227e12126aa05a67b Mon Sep 17 00:00:00 2001
From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
Date: Wed, 30 Apr 2025 14:57:59 +0800
Subject: [PATCH] Support downloading exact file for hf wrapper (#1323)

---
 modelscope/hub/api.py               |  3 +-
 modelscope/preprocessors/base.py    |  5 +-
 modelscope/utils/hf_util/patcher.py | 90 ++++++++++++++++++++++-------
 tests/utils/test_hf_util.py         | 12 ++++
 4 files changed, 86 insertions(+), 24 deletions(-)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index b1889b23..18a9847c 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -2058,7 +2058,8 @@ class HubApi:
         if query_addr:
             domain_response = send_request(
                 query_addr, timeout=internal_timeout)
-            region_id = domain_response.text.strip()
+            if domain_response is not None:
+                region_id = domain_response.text.strip()
         return region_id

diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py
index 2fbc78af..d3138a3e 100644
--- a/modelscope/preprocessors/base.py
+++ b/modelscope/preprocessors/base.py
@@ -326,9 +326,10 @@ class Preprocessor(ABC):
             )
             return None
         if (model_type, task) not in PREPROCESSOR_MAP:
-            logger.warning(
+            logger.info(
                 f'No preprocessor key {(model_type, task)} found in PREPROCESSOR_MAP, '
-                f'skip building preprocessor.')
+                f'skip building preprocessor. If the pipeline runs normally, please ignore this log.'
+            )
             return None

         sub_cfg = ConfigDict({

diff --git a/modelscope/utils/hf_util/patcher.py b/modelscope/utils/hf_util/patcher.py
index c1b4f54f..0174cf93 100644
--- a/modelscope/utils/hf_util/patcher.py
+++ b/modelscope/utils/hf_util/patcher.py
@@ -220,13 +220,60 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):
             The wrapped class
         """

+        @contextlib.contextmanager
+        def file_pattern_context(kwargs, module_class, cls):
+            # Fall back to the patterns captured by the enclosing closure.
+            if 'allow_file_pattern' not in kwargs:
+                kwargs['allow_file_pattern'] = allow_file_pattern
+            if 'ignore_file_pattern' not in kwargs:
+                kwargs['ignore_file_pattern'] = ignore_file_pattern
+
+            if kwargs.get(
+                    'allow_file_pattern') is None and module_class is not None:
+                # Derive a class-specific allow list so that only the files
+                # the class actually reads are downloaded.
+                extra_allow_file_pattern = None
+                if module_class.__name__ == 'GenerationConfig':
+                    from transformers.utils import GENERATION_CONFIG_NAME
+                    extra_allow_file_pattern = [GENERATION_CONFIG_NAME, r'*.py']
+                elif 'Config' in module_class.__name__:
+                    from transformers import CONFIG_NAME
+                    extra_allow_file_pattern = [CONFIG_NAME, r'*.py']
+                elif 'Tokenizer' in module_class.__name__:
+                    from transformers.tokenization_utils import ADDED_TOKENS_FILE
+                    from transformers.tokenization_utils import SPECIAL_TOKENS_MAP_FILE
+                    from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
+                    from transformers.tokenization_utils_base import FULL_TOKENIZER_FILE
+                    from transformers.tokenization_utils_base import CHAT_TEMPLATE_FILE
+                    extra_allow_file_pattern = list(
+                        cls.vocab_files_names.values() if cls is not None
+                        and hasattr(cls, 'vocab_files_names') else []) + [
+                            ADDED_TOKENS_FILE, SPECIAL_TOKENS_MAP_FILE,
+                            TOKENIZER_CONFIG_FILE, FULL_TOKENIZER_FILE,
+                            CHAT_TEMPLATE_FILE, r'*.py'
+                        ]  # noqa
+                elif 'Processor' in module_class.__name__:
+                    from transformers.utils import FEATURE_EXTRACTOR_NAME
+                    from transformers.utils import PROCESSOR_NAME
+                    from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
+                    extra_allow_file_pattern = [
+                        FEATURE_EXTRACTOR_NAME, TOKENIZER_CONFIG_FILE,
+                        PROCESSOR_NAME, r'*.py'
+                    ]
+
+                kwargs['allow_file_pattern'] = extra_allow_file_pattern
+            yield
+            # Strip the wrapper-only kwargs before they reach transformers.
+            kwargs.pop('ignore_file_pattern', None)
+            kwargs.pop('allow_file_pattern', None)
+
         def from_pretrained(model, model_id, *model_args, **kwargs):
-            # model is an instance
-            model_dir = get_model_dir(
-                model_id,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, module_class):
+                # model is an instance
+                model_dir = get_model_dir(model_id, **kwargs)

             module_obj = module_class.from_pretrained(model, model_dir,
                                                       *model_args, **kwargs)
@@ -238,11 +285,9 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):
         @classmethod
         def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
                             **kwargs):
-            model_dir = get_model_dir(
-                pretrained_model_name_or_path,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(pretrained_model_name_or_path,
+                                          **kwargs)

             module_obj = module_class.from_pretrained(
                 model_dir, *model_args, **kwargs)
@@ -253,22 +298,25 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):

         @classmethod
         def _get_peft_type(cls, model_id, **kwargs):
-            model_dir = get_model_dir(
-                model_id,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(model_id, **kwargs)
+
             module_obj = module_class._get_peft_type(model_dir, **kwargs)
             return module_obj

         @classmethod
         def get_config_dict(cls, pretrained_model_name_or_path, *model_args,
                             **kwargs):
-            model_dir = get_model_dir(
-                pretrained_model_name_or_path,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(pretrained_model_name_or_path,
+                                          **kwargs)
             module_obj = module_class.get_config_dict(
                 model_dir, *model_args, **kwargs)

diff --git a/tests/utils/test_hf_util.py b/tests/utils/test_hf_util.py
index f280ee35..a11f2893 100644
--- a/tests/utils/test_hf_util.py
+++ b/tests/utils/test_hf_util.py
@@ -91,6 +91,18 @@ class HFUtilTest(unittest.TestCase):
             revision='v1.0.3')
         self.assertEqual(gen_config.assistant_token_id, 196)

+    def test_qwen_tokenizer(self):
+        from modelscope import Qwen2Tokenizer
+        tokenizer = Qwen2Tokenizer.from_pretrained(
+            'Qwen/Qwen2-Math-7B-Instruct')
+        self.assertTrue(tokenizer is not None)
+
+    def test_extra_ignore_args(self):
+        from modelscope import Qwen2Tokenizer
+        tokenizer = Qwen2Tokenizer.from_pretrained(
+            'Qwen/Qwen2-Math-7B-Instruct', ignore_file_pattern=[r'\w+\.h5'])
+        self.assertTrue(tokenizer is not None)
+
     def test_transformer_patch(self):
         with patch_context():
             from transformers import AutoTokenizer, AutoModelForCausalLM
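
A minimal usage sketch of the behavior this patch enables, assuming a
ModelScope environment with network access. The model id mirrors the one
used in the new tests; `ignore_file_pattern` and `allow_file_pattern` are
the wrapper-level kwargs that file_pattern_context consumes before
delegating to transformers:

    from modelscope import Qwen2Tokenizer

    # Loading through the wrapped class now downloads only the files a
    # tokenizer actually reads (vocab files, tokenizer_config.json,
    # special_tokens_map.json, *.py, ...) instead of the full repository.
    tokenizer = Qwen2Tokenizer.from_pretrained('Qwen/Qwen2-Math-7B-Instruct')

    # Callers may still narrow the selection explicitly; the kwarg is
    # popped by file_pattern_context before reaching from_pretrained.
    tokenizer = Qwen2Tokenizer.from_pretrained(
        'Qwen/Qwen2-Math-7B-Instruct', ignore_file_pattern=[r'\w+\.h5'])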