Support downloading exact file for hf wrapper (#1323)

Author: tastelikefeet
Date: 2025-04-30 14:57:59 +08:00
Committed by: GitHub
Parent commit: 806ac2b05e
Commit: a91f19ea54

4 changed files with 86 additions and 24 deletions
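In practice, the wrapped classes keep accepting explicit ignore_file_pattern / allow_file_pattern arguments and, when neither is passed, now derive a per-class allow-list so only the files the class actually needs are downloaded. A minimal usage sketch (the model id and the ignore pattern are taken from the new tests; everything else describes the patched ModelScope wrappers, not a new API):

from modelscope import Qwen2Tokenizer

# Explicit pattern: skip any *.h5 weight files while resolving the tokenizer.
tokenizer = Qwen2Tokenizer.from_pretrained(
    'Qwen/Qwen2-Math-7B-Instruct', ignore_file_pattern=[r'\w+\.h5'])

# No pattern given: the wrapper fills in a tokenizer allow-list
# (vocab files, tokenizer_config.json, chat template, *.py) instead of
# downloading the whole repository.
tokenizer = Qwen2Tokenizer.from_pretrained('Qwen/Qwen2-Math-7B-Instruct')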


@@ -2058,7 +2058,8 @@ class HubApi:
         if query_addr:
             domain_response = send_request(query_addr, timeout=internal_timeout)
-            region_id = domain_response.text.strip()
+            if domain_response is not None:
+                region_id = domain_response.text.strip()
         return region_id


@@ -326,9 +326,10 @@ class Preprocessor(ABC):
             )
             return None
         if (model_type, task) not in PREPROCESSOR_MAP:
-            logger.warning(
+            logger.info(
                 f'No preprocessor key {(model_type, task)} found in PREPROCESSOR_MAP, '
-                f'skip building preprocessor.')
+                f'skip building preprocessor. If the pipeline runs normally, please ignore this log.'
+            )
             return None
         sub_cfg = ConfigDict({


@@ -220,13 +220,60 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):
         The wrapped class
         """

+        @contextlib.contextmanager
+        def file_pattern_context(kwargs, module_class, cls):
+            if 'allow_file_pattern' not in kwargs:
+                kwargs['allow_file_pattern'] = allow_file_pattern
+            if 'ignore_file_pattern' not in kwargs:
+                kwargs['ignore_file_pattern'] = ignore_file_pattern
+            if kwargs.get(
+                    'allow_file_pattern') is None and module_class is not None:
+                extra_allow_file_pattern = None
+                if 'GenerationConfig' == module_class.__name__:
+                    from transformers.utils import GENERATION_CONFIG_NAME
+                    extra_allow_file_pattern = [
+                        GENERATION_CONFIG_NAME, r'*.py'
+                    ]
+                elif 'Config' in module_class.__name__:
+                    from transformers import CONFIG_NAME
+                    extra_allow_file_pattern = [CONFIG_NAME, r'*.py']
+                elif 'Tokenizer' in module_class.__name__:
+                    from transformers.tokenization_utils import ADDED_TOKENS_FILE
+                    from transformers.tokenization_utils import SPECIAL_TOKENS_MAP_FILE
+                    from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
+                    from transformers.tokenization_utils_base import FULL_TOKENIZER_FILE
+                    from transformers.tokenization_utils_base import CHAT_TEMPLATE_FILE
+                    extra_allow_file_pattern = list(
+                        (cls.vocab_files_names.values()) if cls is not None
+                        and hasattr(cls, 'vocab_files_names') else []) + [
+                            ADDED_TOKENS_FILE, SPECIAL_TOKENS_MAP_FILE,
+                            TOKENIZER_CONFIG_FILE, FULL_TOKENIZER_FILE,
+                            CHAT_TEMPLATE_FILE, r'*.py'
+                        ]  # noqa
+                elif 'Processor' in module_class.__name__:
+                    from transformers.utils import FEATURE_EXTRACTOR_NAME
+                    from transformers.utils import PROCESSOR_NAME
+                    from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
+                    extra_allow_file_pattern = [
+                        FEATURE_EXTRACTOR_NAME, TOKENIZER_CONFIG_FILE,
+                        PROCESSOR_NAME, r'*.py'
+                    ]
+                kwargs['allow_file_pattern'] = extra_allow_file_pattern
+            yield
+            kwargs.pop('ignore_file_pattern', None)
+            kwargs.pop('allow_file_pattern', None)
+
         def from_pretrained(model, model_id, *model_args, **kwargs):
-            # model is an instance
-            model_dir = get_model_dir(
-                model_id,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs):
+                # model is an instance
+                model_dir = get_model_dir(
+                    model_id,
+                    module_class=module_class,
+                    cls=module_class,
+                    **kwargs)
             module_obj = module_class.from_pretrained(model, model_dir,
                                                       *model_args, **kwargs)
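The helper above mutates the caller's kwargs for the duration of the download and then removes the pattern keys so they do not leak into the later from_pretrained call. A self-contained toy sketch of that contextmanager pattern (default_kwargs and fake_download are hypothetical names for illustration, not part of ModelScope):

import contextlib

@contextlib.contextmanager
def default_kwargs(kwargs, **defaults):
    # Inject defaults only for keys the caller did not pass explicitly.
    added = [key for key in defaults if key not in kwargs]
    for key in added:
        kwargs[key] = defaults[key]
    try:
        yield kwargs
    finally:
        # Drop the injected keys again so downstream calls that spread
        # **kwargs never see them; file_pattern_context pops its two keys
        # unconditionally for the same reason.
        for key in added:
            kwargs.pop(key, None)

def fake_download(model_id, **kwargs):
    return model_id, kwargs.get('allow_file_pattern')

kwargs = {}
with default_kwargs(kwargs, allow_file_pattern=['config.json', '*.py']):
    fake_download('some/model', **kwargs)  # sees the injected allow-list
assert 'allow_file_pattern' not in kwargs  # cleaned up after the block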
@@ -238,11 +285,9 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):
         @classmethod
         def from_pretrained(cls, pretrained_model_name_or_path,
                             *model_args, **kwargs):
-            model_dir = get_model_dir(
-                pretrained_model_name_or_path,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(pretrained_model_name_or_path,
+                                          **kwargs)
             module_obj = module_class.from_pretrained(
                 model_dir, *model_args, **kwargs)
@@ -253,22 +298,25 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):
         @classmethod
         def _get_peft_type(cls, model_id, **kwargs):
-            model_dir = get_model_dir(
-                model_id,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(
+                    model_id,
+                    ignore_file_pattern=ignore_file_pattern,
+                    allow_file_pattern=allow_file_pattern,
+                    **kwargs)
             module_obj = module_class._get_peft_type(model_dir, **kwargs)
             return module_obj

         @classmethod
         def get_config_dict(cls, pretrained_model_name_or_path,
                             *model_args, **kwargs):
-            model_dir = get_model_dir(
-                pretrained_model_name_or_path,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(
+                    pretrained_model_name_or_path,
+                    ignore_file_pattern=ignore_file_pattern,
+                    allow_file_pattern=allow_file_pattern,
+                    **kwargs)
             module_obj = module_class.get_config_dict(
                 model_dir, *model_args, **kwargs)
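For the config-style classes wrapped above, the net effect is that a bare from_pretrained call fetches only the matching config file plus any remote code. A hedged illustration (the model id is illustrative, and GenerationConfig is assumed to be the existing ModelScope wrapper export used elsewhere in these tests):

from modelscope import GenerationConfig

# With no explicit patterns, the GenerationConfig branch of the new logic
# restricts the download to generation_config.json and *.py files.
gen_config = GenerationConfig.from_pretrained('Qwen/Qwen2-Math-7B-Instruct')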


@@ -91,6 +91,18 @@ class HFUtilTest(unittest.TestCase):
                                              revision='v1.0.3')
         self.assertEqual(gen_config.assistant_token_id, 196)

+    def test_qwen_tokenizer(self):
+        from modelscope import Qwen2Tokenizer
+        tokenizer = Qwen2Tokenizer.from_pretrained(
+            'Qwen/Qwen2-Math-7B-Instruct')
+        self.assertTrue(tokenizer is not None)
+
+    def test_extra_ignore_args(self):
+        from modelscope import Qwen2Tokenizer
+        tokenizer = Qwen2Tokenizer.from_pretrained(
+            'Qwen/Qwen2-Math-7B-Instruct', ignore_file_pattern=[r'\w+\.h5'])
+        self.assertTrue(tokenizer is not None)
+
     def test_transformer_patch(self):
         with patch_context():
             from transformers import AutoTokenizer, AutoModelForCausalLM
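The test_transformer_patch case exercises the same logic through native transformers classes. A short sketch of that path (the patch_context import path is assumed from the test's usage; the model id is illustrative):

from modelscope.utils.hf_util import patch_context

with patch_context():
    from transformers import AutoTokenizer
    # AutoTokenizer matches the 'Tokenizer' branch of the allow-pattern
    # logic, so only tokenizer files and *.py should be fetched.
    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-Math-7B-Instruct')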