From a91f19ea5412a0e18bc3f9c227e12126aa05a67b Mon Sep 17 00:00:00 2001
From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
Date: Wed, 30 Apr 2025 14:57:59 +0800
Subject: [PATCH] Support downloading exact file for hf wrapper (#1323)

---
 modelscope/hub/api.py               |  3 +-
 modelscope/preprocessors/base.py    |  5 +-
 modelscope/utils/hf_util/patcher.py | 90 ++++++++++++++++++++++-------
 tests/utils/test_hf_util.py         | 12 ++++
 4 files changed, 86 insertions(+), 24 deletions(-)

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index b1889b23..18a9847c 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -2058,7 +2058,8 @@ class HubApi:
         if query_addr:
             domain_response = send_request(
                 query_addr, timeout=internal_timeout)
-            region_id = domain_response.text.strip()
+            if domain_response is not None:
+                region_id = domain_response.text.strip()
         return region_id

diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py
index 2fbc78af..d3138a3e 100644
--- a/modelscope/preprocessors/base.py
+++ b/modelscope/preprocessors/base.py
@@ -326,9 +326,10 @@ class Preprocessor(ABC):
             )
             return None
         if (model_type, task) not in PREPROCESSOR_MAP:
-            logger.warning(
+            logger.info(
                 f'No preprocessor key {(model_type, task)} found in PREPROCESSOR_MAP, '
-                f'skip building preprocessor.')
+                f'skip building preprocessor. If the pipeline runs normally, please ignore this log.'
+            )
             return None

         sub_cfg = ConfigDict({

diff --git a/modelscope/utils/hf_util/patcher.py b/modelscope/utils/hf_util/patcher.py
index c1b4f54f..0174cf93 100644
--- a/modelscope/utils/hf_util/patcher.py
+++ b/modelscope/utils/hf_util/patcher.py
@@ -220,13 +220,60 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):
             The wrapped class
         """

+        @contextlib.contextmanager
+        def file_pattern_context(kwargs, module_class, cls):
+            # Fall back to the patterns captured by the enclosing closure.
+            if 'allow_file_pattern' not in kwargs:
+                kwargs['allow_file_pattern'] = allow_file_pattern
+            if 'ignore_file_pattern' not in kwargs:
+                kwargs['ignore_file_pattern'] = ignore_file_pattern
+
+            if kwargs.get(
+                    'allow_file_pattern') is None and module_class is not None:
+                # Derive a class-specific allow list so that only the files
+                # the class actually reads are downloaded.
+                extra_allow_file_pattern = None
+                if module_class.__name__ == 'GenerationConfig':
+                    from transformers.utils import GENERATION_CONFIG_NAME
+                    extra_allow_file_pattern = [GENERATION_CONFIG_NAME, r'*.py']
+                elif 'Config' in module_class.__name__:
+                    from transformers import CONFIG_NAME
+                    extra_allow_file_pattern = [CONFIG_NAME, r'*.py']
+                elif 'Tokenizer' in module_class.__name__:
+                    from transformers.tokenization_utils import ADDED_TOKENS_FILE
+                    from transformers.tokenization_utils import SPECIAL_TOKENS_MAP_FILE
+                    from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
+                    from transformers.tokenization_utils_base import FULL_TOKENIZER_FILE
+                    from transformers.tokenization_utils_base import CHAT_TEMPLATE_FILE
+                    extra_allow_file_pattern = list(
+                        cls.vocab_files_names.values() if cls is not None
+                        and hasattr(cls, 'vocab_files_names') else []) + [
+                            ADDED_TOKENS_FILE, SPECIAL_TOKENS_MAP_FILE,
+                            TOKENIZER_CONFIG_FILE, FULL_TOKENIZER_FILE,
+                            CHAT_TEMPLATE_FILE, r'*.py'
+                        ]  # noqa
+                elif 'Processor' in module_class.__name__:
+                    from transformers.utils import FEATURE_EXTRACTOR_NAME
+                    from transformers.utils import PROCESSOR_NAME
+                    from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
+                    extra_allow_file_pattern = [
+                        FEATURE_EXTRACTOR_NAME, TOKENIZER_CONFIG_FILE,
+                        PROCESSOR_NAME, r'*.py'
+                    ]
+
+                kwargs['allow_file_pattern'] = extra_allow_file_pattern
+            yield
+            # Strip the wrapper-only kwargs before they reach transformers.
+            kwargs.pop('ignore_file_pattern', None)
+            kwargs.pop('allow_file_pattern', None)
+
         def from_pretrained(model, model_id, *model_args, **kwargs):
-            # model is an instance
-            model_dir = get_model_dir(
-                model_id,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, module_class):
+                # model is an instance
+                model_dir = get_model_dir(model_id, **kwargs)

             module_obj = module_class.from_pretrained(model, model_dir,
                                                       *model_args, **kwargs)
@@ -238,11 +285,9 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):
         @classmethod
         def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
                             **kwargs):
-            model_dir = get_model_dir(
-                pretrained_model_name_or_path,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(pretrained_model_name_or_path,
+                                          **kwargs)

             module_obj = module_class.from_pretrained(
                 model_dir, *model_args, **kwargs)
@@ -253,22 +298,25 @@ def _patch_pretrained_class(all_imported_modules, wrap=False):

         @classmethod
         def _get_peft_type(cls, model_id, **kwargs):
-            model_dir = get_model_dir(
-                model_id,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(model_id, **kwargs)
+
             module_obj = module_class._get_peft_type(model_dir, **kwargs)
             return module_obj

         @classmethod
         def get_config_dict(cls, pretrained_model_name_or_path, *model_args,
                             **kwargs):
-            model_dir = get_model_dir(
-                pretrained_model_name_or_path,
-                ignore_file_pattern=ignore_file_pattern,
-                allow_file_pattern=allow_file_pattern,
-                **kwargs)
+            with file_pattern_context(kwargs, module_class, cls):
+                model_dir = get_model_dir(pretrained_model_name_or_path,
+                                          **kwargs)
             module_obj = module_class.get_config_dict(
                 model_dir, *model_args, **kwargs)

diff --git a/tests/utils/test_hf_util.py b/tests/utils/test_hf_util.py
index f280ee35..a11f2893 100644
--- a/tests/utils/test_hf_util.py
+++ b/tests/utils/test_hf_util.py
@@ -91,6 +91,18 @@ class HFUtilTest(unittest.TestCase):
             revision='v1.0.3')
         self.assertEqual(gen_config.assistant_token_id, 196)

+    def test_qwen_tokenizer(self):
+        from modelscope import Qwen2Tokenizer
+        tokenizer = Qwen2Tokenizer.from_pretrained(
+            'Qwen/Qwen2-Math-7B-Instruct')
+        self.assertTrue(tokenizer is not None)
+
+    def test_extra_ignore_args(self):
+        from modelscope import Qwen2Tokenizer
+        tokenizer = Qwen2Tokenizer.from_pretrained(
+            'Qwen/Qwen2-Math-7B-Instruct', ignore_file_pattern=[r'\w+\.h5'])
+        self.assertTrue(tokenizer is not None)
+
     def test_transformer_patch(self):
         with patch_context():
             from transformers import AutoTokenizer, AutoModelForCausalLM
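
A minimal usage sketch of the behavior this patch enables, assuming a
ModelScope environment with network access. The model id mirrors the one
used in the new tests; `ignore_file_pattern` and `allow_file_pattern` are
the wrapper-level kwargs that file_pattern_context consumes before
delegating to transformers:

    from modelscope import Qwen2Tokenizer

    # Loading through the wrapped class now downloads only the files a
    # tokenizer actually reads (vocab files, tokenizer_config.json,
    # special_tokens_map.json, *.py, ...) instead of the full repository.
    tokenizer = Qwen2Tokenizer.from_pretrained('Qwen/Qwen2-Math-7B-Instruct')

    # Callers may still narrow the selection explicitly; the kwarg is
    # popped by file_pattern_context before reaching from_pretrained.
    tokenizer = Qwen2Tokenizer.from_pretrained(
        'Qwen/Qwen2-Math-7B-Instruct', ignore_file_pattern=[r'\w+\.h5'])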