Merge branch 'master' of github.com:modelscope/modelscope into release/1.19

This commit is contained in:
xingjun.wang
2024-10-12 22:01:32 +08:00
7 changed files with 63 additions and 22 deletions

View File

@@ -17,7 +17,7 @@ class DatasetContextConfig:
data_files: Union[str, Sequence[str],
Mapping[str, Union[str, Sequence[str]]]],
download_mode: DownloadMode, cache_root_dir: str,
use_streaming: bool, stream_batch_size: int, **kwargs):
use_streaming: bool, stream_batch_size: int, trust_remote_code: bool, **kwargs):
self._download_config = None
self._data_meta_config = None
@@ -44,6 +44,7 @@ class DatasetContextConfig:
self.use_streaming = use_streaming
self.stream_batch_size = stream_batch_size
self.download_virgo_files: bool = False
self.trust_remote_code: bool = trust_remote_code
@property
def config_kwargs(self) -> dict:

View File

@@ -127,6 +127,7 @@ class OssDownloader(BaseDownloader):
cache_dir = self.dataset_context_config.cache_root_dir
download_mode = self.dataset_context_config.download_mode
input_kwargs = self.dataset_context_config.config_kwargs
trust_remote_code = self.dataset_context_config.trust_remote_code
if self.builder is None and not dataset_py_script:
raise f'meta-file: {dataset_name}.py not found on the modelscope hub.'
@@ -141,7 +142,7 @@ class OssDownloader(BaseDownloader):
data_files=data_files,
cache_dir=cache_dir,
download_mode=download_mode.value,
ignore_verifications=True,
trust_remote_code=trust_remote_code,
**input_kwargs)
else:
self.dataset = self.data_files_manager.fetch_data_files(

View File

@@ -105,6 +105,7 @@ class RemoteDataLoaderManager(DataLoaderManager):
download_mode_val = self.dataset_context_config.download_mode.value
use_streaming = self.dataset_context_config.use_streaming
input_config_kwargs = self.dataset_context_config.config_kwargs
trust_remote_code = self.dataset_context_config.trust_remote_code
# To use the huggingface data loader
if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER:
@@ -118,6 +119,7 @@ class RemoteDataLoaderManager(DataLoaderManager):
download_mode=download_mode_val,
streaming=use_streaming,
ignore_verifications=True,
trust_remote_code=trust_remote_code,
**input_config_kwargs)
# download statistics
self.api.dataset_download_statistics(

View File

@@ -168,6 +168,7 @@ class MsDataset:
custom_cfg: Optional[Config] = Config(),
token: Optional[str] = None,
dataset_info_only: Optional[bool] = False,
trust_remote_code: Optional[bool] = True,
**config_kwargs,
) -> Union[dict, 'MsDataset', NativeIterableDataset]:
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
@@ -198,6 +199,7 @@ class MsDataset:
see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
token (str, Optional): SDK token of ModelScope.
dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
trust_remote_code (bool, Optional): If set to True, trust the remote code.
**config_kwargs (additional keyword arguments): Keyword arguments to be passed
Returns:
@@ -250,6 +252,7 @@ class MsDataset:
cache_root_dir=cache_dir,
use_streaming=use_streaming,
stream_batch_size=stream_batch_size,
trust_remote_code=trust_remote_code,
**config_kwargs)
# Load from local disk
@@ -275,6 +278,7 @@ class MsDataset:
split=split,
streaming=use_streaming,
download_mode=download_mode.value,
trust_remote_code=trust_remote_code,
**config_kwargs)
# Load from the modelscope hub
@@ -303,7 +307,7 @@ class MsDataset:
token=token,
streaming=use_streaming,
dataset_info_only=dataset_info_only,
trust_remote_code=True,
trust_remote_code=trust_remote_code,
**config_kwargs) as dataset_res:
return dataset_res

View File

@@ -824,7 +824,7 @@ def get_module_with_script(self) -> DatasetModule:
name=self.name,
)
if not os.path.exists(importable_file_path):
trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name)
trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
if trust_remote_code:
_create_importable_file(
local_path=local_script_path,
@@ -884,7 +884,7 @@ class DatasetsWrapperHF:
streaming: bool = False,
num_proc: Optional[int] = None,
storage_options: Optional[Dict] = None,
trust_remote_code: bool = None,
trust_remote_code: bool = True,
dataset_info_only: Optional[bool] = False,
**config_kwargs,
) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset,

View File

@@ -4,7 +4,7 @@ from typing import Any, Dict, List
import requests
from modelscope import AutoTokenizer, get_logger, snapshot_download
from modelscope import AutoTokenizer, get_logger, snapshot_download, AutoConfig
from . import TemplateType
from .base import Template, get_template
@@ -83,7 +83,7 @@ template_info = [
TemplateInfo(
template=TemplateType.chatml,
template_regex=
f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*{chat_suffix}.*',
f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*',
modelfile_link=
'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/yi-1.5.modelfile',
),
@@ -110,6 +110,10 @@ template_info = [
'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/glm4.modelfile',
),
TemplateInfo(
template_regex=f'.*{cases("llava-llama-3")}.*',
modelfile_link='https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/llava-llama-3.modelfile'),
# baichuan
TemplateInfo(
template=TemplateType.baichuan,
@@ -258,6 +262,7 @@ class TemplateLoader:
"""
ignore_file_pattern = [r'.+\.bin$', r'.+\.safetensors$', r'.+\.gguf$']
tokenizer = kwargs.get('tokenizer')
config = kwargs.get('config')
for _info in template_info:
if re.fullmatch(_info.template_regex, model_id):
if _info.template:
@@ -269,10 +274,11 @@ class TemplateLoader:
ignore_file_pattern=ignore_file_pattern)
tokenizer = AutoTokenizer.from_pretrained(
model_dir, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
except Exception:
pass
return TemplateLoader.load_by_template_name(
_info.template, tokenizer=tokenizer, **kwargs)
_info.template, tokenizer=tokenizer, config=config, **kwargs)
@staticmethod
def load_by_template_name(template_name: str, **kwargs) -> Template:
@@ -288,7 +294,9 @@ class TemplateLoader:
Returns:
The template instance
"""
return get_template(template_name, tokenizer=kwargs.pop('tokenizer', None), **kwargs)
template = get_template(template_name, tokenizer=kwargs.pop('tokenizer', None), **kwargs)
template.config = kwargs.get('config')
return template
@staticmethod
def replace_and_concat(template: Template, template_list: List,
@@ -326,33 +334,41 @@ class TemplateLoader:
Returns:
The ModelFile content, returns `None` if no template found
"""
if not model_id and not template_name:
raise ValueError(
f'Please make sure you model_id: {model_id} '
f'and template_name: {template_name} is supported.')
logger.info('Exporting to ollama:')
if model_id:
for _info in template_info:
if re.fullmatch(_info.template_regex, model_id):
if _info.modelfile_link:
if _info.modelfile_link and not kwargs.get('ignore_oss_model_file', False):
return TemplateLoader._read_content_from_url(
_info.modelfile_link)
elif _info.template and not template_name:
template_name = _info.template
if template_name:
template = TemplateLoader.load_by_template_name(
template_name, **kwargs)
else:
raise ValueError(
f'Please make sure you model_id: {model_id} '
f'and template_name: {template_name} is supported.')
template = TemplateLoader.load_by_model_id(
model_id, **kwargs)
if template is None:
return None
content = ''
content += 'FROM {{gguf_file}}\n'
content += (
f'TEMPLATE """{{{{ if .System }}}}'
f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}'
f'{{{{ else }}}}{TemplateLoader.replace_and_concat(template, template.prefix, "", "")}'
f'{{{{ end }}}}')
content += 'FROM {gguf_file}\n'
_prefix = TemplateLoader.replace_and_concat(template, template.prefix, "", "")
if _prefix:
content += (
f'TEMPLATE """{{{{ if .System }}}}'
f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}'
f'{{{{ else }}}}{_prefix}'
f'{{{{ end }}}}')
else:
content += (
f'TEMPLATE """{{{{ if .System }}}}'
f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}'
f'{{{{ end }}}}')
content += (
f'{{{{ if .Prompt }}}}'
f'{TemplateLoader.replace_and_concat(template, template.prompt, "{{QUERY}}", "{{ .Prompt }}")}'
@@ -360,7 +376,16 @@ class TemplateLoader:
content += '{{ .Response }}'
content += TemplateLoader.replace_and_concat(template, template.suffix,
'', '') + '"""\n'
content += f'PARAMETER stop "{TemplateLoader.replace_and_concat(template, template.suffix, "", "")}"\n'
all_eos_tokens = {TemplateLoader.replace_and_concat(template, template.suffix, "", "")}
if getattr(template, 'tokenizer', None):
eos_token = TemplateLoader.replace_and_concat(template, [["eos_token_id"]], "", "")
all_eos_tokens.add(eos_token)
if getattr(template, 'config', None) and getattr(template.config, 'eos_token_id'):
eos_token_id = template.config.eos_token_id
eos_token = TemplateLoader.replace_and_concat(template, [[eos_token_id]], "", "")
all_eos_tokens.add(eos_token)
for eos_token in all_eos_tokens:
content += f'PARAMETER stop "{eos_token}"\n'
return content
@staticmethod

View File

@@ -100,6 +100,14 @@ class TestToOllama(unittest.TestCase):
ollama = TemplateLoader.to_ollama(
'QuantFactory/Mistral-Nemo-Japanese-Instruct-2408-GGUF')
self.assertTrue(ollama is not None)
ollama = TemplateLoader.to_ollama('AI-ModelScope/Yi-1.5-9B-32K-GGUF')
self.assertTrue(ollama is not None)
ollama = TemplateLoader.to_ollama(
'AI-ModelScope/llava-llama-3-8b-v1_1-gguf')
self.assertTrue(ollama is not None)
ollama = TemplateLoader.to_ollama(
'01ai/Yi-1.5-9B-Chat', ignore_oss_model_file=True)
self.assertTrue(ollama is not None)
if __name__ == '__main__':