From bd1544bef0945677bfd6c0ac2b24f353f2f0817d Mon Sep 17 00:00:00 2001 From: suluyana <110878454+suluyana@users.noreply.github.com> Date: Thu, 10 Oct 2024 10:01:45 +0800 Subject: [PATCH 1/3] fix template: llava-llama-3 & yi-1.5 (#1011) --- modelscope/preprocessors/templates/loader.py | 6 +++++- tests/tools/test_to_ollama.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/modelscope/preprocessors/templates/loader.py b/modelscope/preprocessors/templates/loader.py index e3ed9f89..e286802b 100644 --- a/modelscope/preprocessors/templates/loader.py +++ b/modelscope/preprocessors/templates/loader.py @@ -83,7 +83,7 @@ template_info = [ TemplateInfo( template=TemplateType.chatml, template_regex= - f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*{chat_suffix}.*', + f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*', modelfile_link= 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/yi-1.5.modelfile', ), @@ -110,6 +110,10 @@ template_info = [ 'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/glm4.modelfile', ), + TemplateInfo( + template_regex=f'.*{cases("llava-llama-3")}.*', + modelfile_link='https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/llava-llama-3.modelfile'), + # baichuan TemplateInfo( template=TemplateType.baichuan, diff --git a/tests/tools/test_to_ollama.py b/tests/tools/test_to_ollama.py index aaf5f4d0..ba92c1ea 100644 --- a/tests/tools/test_to_ollama.py +++ b/tests/tools/test_to_ollama.py @@ -100,6 +100,11 @@ class TestToOllama(unittest.TestCase): ollama = TemplateLoader.to_ollama( 'QuantFactory/Mistral-Nemo-Japanese-Instruct-2408-GGUF') self.assertTrue(ollama is not None) + ollama = TemplateLoader.to_ollama('AI-ModelScope/Yi-1.5-9B-32K-GGUF') + self.assertTrue(ollama is not None) + ollama = TemplateLoader.to_ollama( + 'AI-ModelScope/llava-llama-3-8b-v1_1-gguf') + self.assertTrue(ollama is not None) if __name__ == '__main__': From b0f03ffd6d7d82133d47027291b23aae6d576d55 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Thu, 10 Oct 2024 21:40:51 +0800 Subject: [PATCH 2/3] Fix template of eos_token (#1013) --- modelscope/preprocessors/templates/loader.py | 53 ++++++++++++++------ tests/tools/test_to_ollama.py | 3 ++ 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/modelscope/preprocessors/templates/loader.py b/modelscope/preprocessors/templates/loader.py index e286802b..4f83a9a9 100644 --- a/modelscope/preprocessors/templates/loader.py +++ b/modelscope/preprocessors/templates/loader.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List import requests -from modelscope import AutoTokenizer, get_logger, snapshot_download +from modelscope import AutoTokenizer, get_logger, snapshot_download, AutoConfig from . import TemplateType from .base import Template, get_template @@ -262,6 +262,7 @@ class TemplateLoader: """ ignore_file_pattern = [r'.+\.bin$', r'.+\.safetensors$', r'.+\.gguf$'] tokenizer = kwargs.get('tokenizer') + config = kwargs.get('config') for _info in template_info: if re.fullmatch(_info.template_regex, model_id): if _info.template: @@ -273,10 +274,11 @@ class TemplateLoader: ignore_file_pattern=ignore_file_pattern) tokenizer = AutoTokenizer.from_pretrained( model_dir, trust_remote_code=True) + config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) except Exception: pass return TemplateLoader.load_by_template_name( - _info.template, tokenizer=tokenizer, **kwargs) + _info.template, tokenizer=tokenizer, config=config, **kwargs) @staticmethod def load_by_template_name(template_name: str, **kwargs) -> Template: @@ -292,7 +294,9 @@ class TemplateLoader: Returns: The template instance """ - return get_template(template_name, tokenizer=kwargs.pop('tokenizer', None), **kwargs) + template = get_template(template_name, tokenizer=kwargs.pop('tokenizer', None), **kwargs) + template.config = kwargs.get('config') + return template @staticmethod def replace_and_concat(template: Template, template_list: List, @@ -330,33 +334,41 @@ class TemplateLoader: Returns: The ModelFile content, returns `None` if no template found """ + if not model_id and not template_name: + raise ValueError( + f'Please make sure you model_id: {model_id} ' + f'and template_name: {template_name} is supported.') logger.info('Exporting to ollama:') if model_id: for _info in template_info: if re.fullmatch(_info.template_regex, model_id): - if _info.modelfile_link: + if _info.modelfile_link and not kwargs.get('ignore_oss_model_file', False): return TemplateLoader._read_content_from_url( _info.modelfile_link) - elif _info.template and not template_name: - template_name = _info.template if template_name: template = TemplateLoader.load_by_template_name( template_name, **kwargs) else: - raise ValueError( - f'Please make sure you model_id: {model_id} ' - f'and template_name: {template_name} is supported.') + template = TemplateLoader.load_by_model_id( + model_id, **kwargs) if template is None: return None content = '' - content += 'FROM {{gguf_file}}\n' - content += ( - f'TEMPLATE """{{{{ if .System }}}}' - f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}' - f'{{{{ else }}}}{TemplateLoader.replace_and_concat(template, template.prefix, "", "")}' - f'{{{{ end }}}}') + content += 'FROM {gguf_file}\n' + _prefix = TemplateLoader.replace_and_concat(template, template.prefix, "", "") + if _prefix: + content += ( + f'TEMPLATE """{{{{ if .System }}}}' + f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}' + f'{{{{ else }}}}{_prefix}' + f'{{{{ end }}}}') + else: + content += ( + f'TEMPLATE """{{{{ if .System }}}}' + f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}' + f'{{{{ end }}}}') content += ( f'{{{{ if .Prompt }}}}' f'{TemplateLoader.replace_and_concat(template, template.prompt, "{{QUERY}}", "{{ .Prompt }}")}' @@ -364,7 +376,16 @@ class TemplateLoader: content += '{{ .Response }}' content += TemplateLoader.replace_and_concat(template, template.suffix, '', '') + '"""\n' - content += f'PARAMETER stop "{TemplateLoader.replace_and_concat(template, template.suffix, "", "")}"\n' + all_eos_tokens = {TemplateLoader.replace_and_concat(template, template.suffix, "", "")} + if getattr(template, 'tokenizer', None): + eos_token = TemplateLoader.replace_and_concat(template, [["eos_token_id"]], "", "") + all_eos_tokens.add(eos_token) + if getattr(template, 'config', None) and getattr(template.config, 'eos_token_id'): + eos_token_id = template.config.eos_token_id + eos_token = TemplateLoader.replace_and_concat(template, [[eos_token_id]], "", "") + all_eos_tokens.add(eos_token) + for eos_token in all_eos_tokens: + content += f'PARAMETER stop "{eos_token}"\n' return content @staticmethod diff --git a/tests/tools/test_to_ollama.py b/tests/tools/test_to_ollama.py index ba92c1ea..ad7a3e87 100644 --- a/tests/tools/test_to_ollama.py +++ b/tests/tools/test_to_ollama.py @@ -105,6 +105,9 @@ class TestToOllama(unittest.TestCase): ollama = TemplateLoader.to_ollama( 'AI-ModelScope/llava-llama-3-8b-v1_1-gguf') self.assertTrue(ollama is not None) + ollama = TemplateLoader.to_ollama( + '01ai/Yi-1.5-9B-Chat', ignore_oss_model_file=True) + self.assertTrue(ollama is not None) if __name__ == '__main__': From 90acaccc28887c69586240bb6469cde2d7858b7c Mon Sep 17 00:00:00 2001 From: "Xingjun.Wang" Date: Sat, 12 Oct 2024 22:00:30 +0800 Subject: [PATCH 3/3] Fix trust_remote_code (#1016) * add cmd line option of clear-cache * fix typo * fix trust_remote_code for old dataset and py-script --------- Co-authored-by: Yingda Chen --- modelscope/msdatasets/context/dataset_context_config.py | 3 ++- modelscope/msdatasets/data_loader/data_loader.py | 3 ++- modelscope/msdatasets/data_loader/data_loader_manager.py | 2 ++ modelscope/msdatasets/ms_dataset.py | 6 +++++- modelscope/msdatasets/utils/hf_datasets_util.py | 4 ++-- 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/modelscope/msdatasets/context/dataset_context_config.py b/modelscope/msdatasets/context/dataset_context_config.py index 48124d78..bfe7dbd1 100644 --- a/modelscope/msdatasets/context/dataset_context_config.py +++ b/modelscope/msdatasets/context/dataset_context_config.py @@ -17,7 +17,7 @@ class DatasetContextConfig: data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]], download_mode: DownloadMode, cache_root_dir: str, - use_streaming: bool, stream_batch_size: int, **kwargs): + use_streaming: bool, stream_batch_size: int, trust_remote_code: bool, **kwargs): self._download_config = None self._data_meta_config = None @@ -44,6 +44,7 @@ class DatasetContextConfig: self.use_streaming = use_streaming self.stream_batch_size = stream_batch_size self.download_virgo_files: bool = False + self.trust_remote_code: bool = trust_remote_code @property def config_kwargs(self) -> dict: diff --git a/modelscope/msdatasets/data_loader/data_loader.py b/modelscope/msdatasets/data_loader/data_loader.py index f29acc8f..92074449 100644 --- a/modelscope/msdatasets/data_loader/data_loader.py +++ b/modelscope/msdatasets/data_loader/data_loader.py @@ -127,6 +127,7 @@ class OssDownloader(BaseDownloader): cache_dir = self.dataset_context_config.cache_root_dir download_mode = self.dataset_context_config.download_mode input_kwargs = self.dataset_context_config.config_kwargs + trust_remote_code = self.dataset_context_config.trust_remote_code if self.builder is None and not dataset_py_script: raise f'meta-file: {dataset_name}.py not found on the modelscope hub.' @@ -141,7 +142,7 @@ class OssDownloader(BaseDownloader): data_files=data_files, cache_dir=cache_dir, download_mode=download_mode.value, - ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_kwargs) else: self.dataset = self.data_files_manager.fetch_data_files( diff --git a/modelscope/msdatasets/data_loader/data_loader_manager.py b/modelscope/msdatasets/data_loader/data_loader_manager.py index 0dec5d89..504f3da6 100644 --- a/modelscope/msdatasets/data_loader/data_loader_manager.py +++ b/modelscope/msdatasets/data_loader/data_loader_manager.py @@ -105,6 +105,7 @@ class RemoteDataLoaderManager(DataLoaderManager): download_mode_val = self.dataset_context_config.download_mode.value use_streaming = self.dataset_context_config.use_streaming input_config_kwargs = self.dataset_context_config.config_kwargs + trust_remote_code = self.dataset_context_config.trust_remote_code # To use the huggingface data loader if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER: @@ -118,6 +119,7 @@ class RemoteDataLoaderManager(DataLoaderManager): download_mode=download_mode_val, streaming=use_streaming, ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_config_kwargs) # download statistics self.api.dataset_download_statistics( diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index b57a16ac..899142ad 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -168,6 +168,7 @@ class MsDataset: custom_cfg: Optional[Config] = Config(), token: Optional[str] = None, dataset_info_only: Optional[bool] = False, + trust_remote_code: Optional[bool] = True, **config_kwargs, ) -> Union[dict, 'MsDataset', NativeIterableDataset]: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. @@ -198,6 +199,7 @@ class MsDataset: see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3 token (str, Optional): SDK token of ModelScope. dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict). + trust_remote_code (bool, Optional): If set to True, trust the remote code. **config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: @@ -250,6 +252,7 @@ class MsDataset: cache_root_dir=cache_dir, use_streaming=use_streaming, stream_batch_size=stream_batch_size, + trust_remote_code=trust_remote_code, **config_kwargs) # Load from local disk @@ -275,6 +278,7 @@ class MsDataset: split=split, streaming=use_streaming, download_mode=download_mode.value, + trust_remote_code=trust_remote_code, **config_kwargs) # Load from the modelscope hub @@ -303,7 +307,7 @@ class MsDataset: token=token, streaming=use_streaming, dataset_info_only=dataset_info_only, - trust_remote_code=True, + trust_remote_code=trust_remote_code, **config_kwargs) as dataset_res: return dataset_res diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index 5b3a8bb7..3fb996ac 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -824,7 +824,7 @@ def get_module_with_script(self) -> DatasetModule: name=self.name, ) if not os.path.exists(importable_file_path): - trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name) + trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name) if trust_remote_code: _create_importable_file( local_path=local_script_path, @@ -884,7 +884,7 @@ class DatasetsWrapperHF: streaming: bool = False, num_proc: Optional[int] = None, storage_options: Optional[Dict] = None, - trust_remote_code: bool = None, + trust_remote_code: bool = True, dataset_info_only: Optional[bool] = False, **config_kwargs, ) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset,