mirror of
https://github.com/modelscope/modelscope.git
synced 2026-02-24 20:19:51 +01:00
ok Merge branch 'master' of github.com:modelscope/modelscope into release/1.19
This commit is contained in:
@@ -17,7 +17,7 @@ class DatasetContextConfig:
|
||||
data_files: Union[str, Sequence[str],
|
||||
Mapping[str, Union[str, Sequence[str]]]],
|
||||
download_mode: DownloadMode, cache_root_dir: str,
|
||||
use_streaming: bool, stream_batch_size: int, **kwargs):
|
||||
use_streaming: bool, stream_batch_size: int, trust_remote_code: bool, **kwargs):
|
||||
|
||||
self._download_config = None
|
||||
self._data_meta_config = None
|
||||
@@ -44,6 +44,7 @@ class DatasetContextConfig:
|
||||
self.use_streaming = use_streaming
|
||||
self.stream_batch_size = stream_batch_size
|
||||
self.download_virgo_files: bool = False
|
||||
self.trust_remote_code: bool = trust_remote_code
|
||||
|
||||
@property
|
||||
def config_kwargs(self) -> dict:
|
||||
|
||||
@@ -127,6 +127,7 @@ class OssDownloader(BaseDownloader):
|
||||
cache_dir = self.dataset_context_config.cache_root_dir
|
||||
download_mode = self.dataset_context_config.download_mode
|
||||
input_kwargs = self.dataset_context_config.config_kwargs
|
||||
trust_remote_code = self.dataset_context_config.trust_remote_code
|
||||
|
||||
if self.builder is None and not dataset_py_script:
|
||||
raise f'meta-file: {dataset_name}.py not found on the modelscope hub.'
|
||||
@@ -141,7 +142,7 @@ class OssDownloader(BaseDownloader):
|
||||
data_files=data_files,
|
||||
cache_dir=cache_dir,
|
||||
download_mode=download_mode.value,
|
||||
ignore_verifications=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**input_kwargs)
|
||||
else:
|
||||
self.dataset = self.data_files_manager.fetch_data_files(
|
||||
|
||||
@@ -105,6 +105,7 @@ class RemoteDataLoaderManager(DataLoaderManager):
|
||||
download_mode_val = self.dataset_context_config.download_mode.value
|
||||
use_streaming = self.dataset_context_config.use_streaming
|
||||
input_config_kwargs = self.dataset_context_config.config_kwargs
|
||||
trust_remote_code = self.dataset_context_config.trust_remote_code
|
||||
|
||||
# Use the Hugging Face data loader
|
||||
if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER:
|
||||
@@ -118,6 +119,7 @@ class RemoteDataLoaderManager(DataLoaderManager):
|
||||
download_mode=download_mode_val,
|
||||
streaming=use_streaming,
|
||||
ignore_verifications=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**input_config_kwargs)
|
||||
# download statistics
|
||||
self.api.dataset_download_statistics(
|
||||
|
||||
@@ -168,6 +168,7 @@ class MsDataset:
|
||||
custom_cfg: Optional[Config] = Config(),
|
||||
token: Optional[str] = None,
|
||||
dataset_info_only: Optional[bool] = False,
|
||||
trust_remote_code: Optional[bool] = True,
|
||||
**config_kwargs,
|
||||
) -> Union[dict, 'MsDataset', NativeIterableDataset]:
|
||||
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
|
||||
@@ -198,6 +199,7 @@ class MsDataset:
|
||||
see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
|
||||
token (str, Optional): SDK token of ModelScope.
|
||||
dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
|
||||
trust_remote_code (bool, Optional): Whether to trust and execute the dataset's remote loading script. Defaults to True.
|
||||
**config_kwargs (additional keyword arguments): Keyword arguments to be passed
|
||||
|
||||
Returns:
|
||||
@@ -250,6 +252,7 @@ class MsDataset:
|
||||
cache_root_dir=cache_dir,
|
||||
use_streaming=use_streaming,
|
||||
stream_batch_size=stream_batch_size,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**config_kwargs)
|
||||
|
||||
# Load from local disk
|
||||
@@ -275,6 +278,7 @@ class MsDataset:
|
||||
split=split,
|
||||
streaming=use_streaming,
|
||||
download_mode=download_mode.value,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**config_kwargs)
|
||||
|
||||
# Load from the modelscope hub
|
||||
@@ -303,7 +307,7 @@ class MsDataset:
|
||||
token=token,
|
||||
streaming=use_streaming,
|
||||
dataset_info_only=dataset_info_only,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**config_kwargs) as dataset_res:
|
||||
|
||||
return dataset_res
|
||||
|
||||
@@ -824,7 +824,7 @@ def get_module_with_script(self) -> DatasetModule:
|
||||
name=self.name,
|
||||
)
|
||||
if not os.path.exists(importable_file_path):
|
||||
trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name)
|
||||
trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
|
||||
if trust_remote_code:
|
||||
_create_importable_file(
|
||||
local_path=local_script_path,
|
||||
@@ -884,7 +884,7 @@ class DatasetsWrapperHF:
|
||||
streaming: bool = False,
|
||||
num_proc: Optional[int] = None,
|
||||
storage_options: Optional[Dict] = None,
|
||||
trust_remote_code: bool = None,
|
||||
trust_remote_code: bool = True,
|
||||
dataset_info_only: Optional[bool] = False,
|
||||
**config_kwargs,
|
||||
) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset,
|
||||
|
||||
@@ -4,7 +4,7 @@ from typing import Any, Dict, List
|
||||
|
||||
import requests
|
||||
|
||||
from modelscope import AutoTokenizer, get_logger, snapshot_download
|
||||
from modelscope import AutoTokenizer, get_logger, snapshot_download, AutoConfig
|
||||
from . import TemplateType
|
||||
from .base import Template, get_template
|
||||
|
||||
@@ -83,7 +83,7 @@ template_info = [
|
||||
TemplateInfo(
|
||||
template=TemplateType.chatml,
|
||||
template_regex=
|
||||
f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*{chat_suffix}.*',
|
||||
f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*',
|
||||
modelfile_link=
|
||||
'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/yi-1.5.modelfile',
|
||||
),
|
||||
@@ -110,6 +110,10 @@ template_info = [
|
||||
'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/glm4.modelfile',
|
||||
),
|
||||
|
||||
TemplateInfo(
|
||||
template_regex=f'.*{cases("llava-llama-3")}.*',
|
||||
modelfile_link='https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/llava-llama-3.modelfile'),
|
||||
|
||||
# baichuan
|
||||
TemplateInfo(
|
||||
template=TemplateType.baichuan,
|
||||
@@ -258,6 +262,7 @@ class TemplateLoader:
|
||||
"""
|
||||
ignore_file_pattern = [r'.+\.bin$', r'.+\.safetensors$', r'.+\.gguf$']
|
||||
tokenizer = kwargs.get('tokenizer')
|
||||
config = kwargs.get('config')
|
||||
for _info in template_info:
|
||||
if re.fullmatch(_info.template_regex, model_id):
|
||||
if _info.template:
|
||||
@@ -269,10 +274,11 @@ class TemplateLoader:
|
||||
ignore_file_pattern=ignore_file_pattern)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_dir, trust_remote_code=True)
|
||||
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
|
||||
except Exception:
|
||||
pass
|
||||
return TemplateLoader.load_by_template_name(
|
||||
_info.template, tokenizer=tokenizer, **kwargs)
|
||||
_info.template, tokenizer=tokenizer, config=config, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def load_by_template_name(template_name: str, **kwargs) -> Template:
|
||||
@@ -288,7 +294,9 @@ class TemplateLoader:
|
||||
Returns:
|
||||
The template instance
|
||||
"""
|
||||
return get_template(template_name, tokenizer=kwargs.pop('tokenizer', None), **kwargs)
|
||||
template = get_template(template_name, tokenizer=kwargs.pop('tokenizer', None), **kwargs)
|
||||
template.config = kwargs.get('config')
|
||||
return template
|
||||
|
||||
@staticmethod
|
||||
def replace_and_concat(template: Template, template_list: List,
|
||||
@@ -326,33 +334,41 @@ class TemplateLoader:
|
||||
Returns:
|
||||
The ModelFile content, or `None` if no template is found
|
||||
"""
|
||||
if not model_id and not template_name:
|
||||
raise ValueError(
|
||||
f'Please make sure you model_id: {model_id} '
|
||||
f'and template_name: {template_name} is supported.')
|
||||
logger.info('Exporting to ollama:')
|
||||
if model_id:
|
||||
for _info in template_info:
|
||||
if re.fullmatch(_info.template_regex, model_id):
|
||||
if _info.modelfile_link:
|
||||
if _info.modelfile_link and not kwargs.get('ignore_oss_model_file', False):
|
||||
return TemplateLoader._read_content_from_url(
|
||||
_info.modelfile_link)
|
||||
elif _info.template and not template_name:
|
||||
template_name = _info.template
|
||||
if template_name:
|
||||
template = TemplateLoader.load_by_template_name(
|
||||
template_name, **kwargs)
|
||||
else:
|
||||
raise ValueError(
|
||||
f'Please make sure you model_id: {model_id} '
|
||||
f'and template_name: {template_name} is supported.')
|
||||
template = TemplateLoader.load_by_model_id(
|
||||
model_id, **kwargs)
|
||||
|
||||
if template is None:
|
||||
return None
|
||||
|
||||
content = ''
|
||||
content += 'FROM {{gguf_file}}\n'
|
||||
content += (
|
||||
f'TEMPLATE """{{{{ if .System }}}}'
|
||||
f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}'
|
||||
f'{{{{ else }}}}{TemplateLoader.replace_and_concat(template, template.prefix, "", "")}'
|
||||
f'{{{{ end }}}}')
|
||||
content += 'FROM {gguf_file}\n'
|
||||
_prefix = TemplateLoader.replace_and_concat(template, template.prefix, "", "")
|
||||
if _prefix:
|
||||
content += (
|
||||
f'TEMPLATE """{{{{ if .System }}}}'
|
||||
f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}'
|
||||
f'{{{{ else }}}}{_prefix}'
|
||||
f'{{{{ end }}}}')
|
||||
else:
|
||||
content += (
|
||||
f'TEMPLATE """{{{{ if .System }}}}'
|
||||
f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}'
|
||||
f'{{{{ end }}}}')
|
||||
content += (
|
||||
f'{{{{ if .Prompt }}}}'
|
||||
f'{TemplateLoader.replace_and_concat(template, template.prompt, "{{QUERY}}", "{{ .Prompt }}")}'
|
||||
@@ -360,7 +376,16 @@ class TemplateLoader:
|
||||
content += '{{ .Response }}'
|
||||
content += TemplateLoader.replace_and_concat(template, template.suffix,
|
||||
'', '') + '"""\n'
|
||||
content += f'PARAMETER stop "{TemplateLoader.replace_and_concat(template, template.suffix, "", "")}"\n'
|
||||
all_eos_tokens = {TemplateLoader.replace_and_concat(template, template.suffix, "", "")}
|
||||
if getattr(template, 'tokenizer', None):
|
||||
eos_token = TemplateLoader.replace_and_concat(template, [["eos_token_id"]], "", "")
|
||||
all_eos_tokens.add(eos_token)
|
||||
if getattr(template, 'config', None) and getattr(template.config, 'eos_token_id'):
|
||||
eos_token_id = template.config.eos_token_id
|
||||
eos_token = TemplateLoader.replace_and_concat(template, [[eos_token_id]], "", "")
|
||||
all_eos_tokens.add(eos_token)
|
||||
for eos_token in all_eos_tokens:
|
||||
content += f'PARAMETER stop "{eos_token}"\n'
|
||||
return content
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -100,6 +100,14 @@ class TestToOllama(unittest.TestCase):
|
||||
ollama = TemplateLoader.to_ollama(
|
||||
'QuantFactory/Mistral-Nemo-Japanese-Instruct-2408-GGUF')
|
||||
self.assertTrue(ollama is not None)
|
||||
ollama = TemplateLoader.to_ollama('AI-ModelScope/Yi-1.5-9B-32K-GGUF')
|
||||
self.assertTrue(ollama is not None)
|
||||
ollama = TemplateLoader.to_ollama(
|
||||
'AI-ModelScope/llava-llama-3-8b-v1_1-gguf')
|
||||
self.assertTrue(ollama is not None)
|
||||
ollama = TemplateLoader.to_ollama(
|
||||
'01ai/Yi-1.5-9B-Chat', ignore_oss_model_file=True)
|
||||
self.assertTrue(ollama is not None)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user