from functools import partial
from typing import Callable, List, Optional, Tuple

import numpy as np
from datasets import Dataset as HfDataset
from datasets import concatenate_datasets

from modelscope import MsDataset
from swift.utils import get_seed


def _processing_alpaca(
        dataset: HfDataset,
        preprocess_input: Optional[Callable[[str], str]] = None) -> HfDataset:
    """Merge each row's `input` into its `instruction`, keeping `output`.

    If `preprocess_input` is given, it is applied to the `input` field
    before merging.
    """
    instruction = dataset['instruction']
    input_ = dataset['input']
    new_instruction = []
    for inst, inp in zip(instruction, input_):
        if inp is None:
            inp = ''
        if preprocess_input is not None:
            inp = preprocess_input(inp)
        inst = f'{inst}\n{inp}'
        new_instruction.append(inst)
    dataset = HfDataset.from_dict({
        'instruction': new_instruction,
        'output': dataset['output']
    })
    return dataset


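# A doctest-style sketch of what `_processing_alpaca` produces (the toy row
# below is illustrative, not from any real dataset):
#   ds = HfDataset.from_dict({
#       'instruction': ['Translate to French'],
#       'input': ['hello'],
#       'output': ['bonjour'],
#   })
#   _processing_alpaca(ds)['instruction']
#   # -> ['Translate to French\nhello']

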
def get_alpaca_gpt4_en_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


def get_alpaca_gpt4_zh_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset()

    def _preprocess_input(inp: str) -> str:
        # Strip the Chinese '输入:' ('input:') prefix that some rows carry.
        if inp.startswith('输入:'):
            inp = inp[3:]
        return inp

    return _processing_alpaca(dataset, _preprocess_input)


def get_finance_en_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'wyj123456/finance_en', split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


_multi_alpaca_language_list = [
    'ar', 'de', 'es', 'fr', 'id', 'ja', 'ko', 'pt', 'ru', 'th', 'vi'
]


def get_multi_alpaca(subset_name: str) -> HfDataset:
    """
    subset_name:
        Language-key   Language     # examples
        ar             Arabic       14,671
        de             German       9,515
        es             Spanish      9,958
        fr             French       11,332
        id             Indonesian   12,117
        ja             Japanese     10,191
        ko             Korean       14,402
        pt             Portuguese   10,825
        ru             Russian      14,286
        th             Thai         11,496
        vi             Vietnamese   13,908
    """
    dataset: HfDataset = MsDataset.load(
        'damo/nlp_polylm_multialpaca_sft',
        subset_name=subset_name,
        split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


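# Example (illustrative): load a single language subset from the table above.
#   de_dataset = get_multi_alpaca('de')

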
def get_multi_alpaca_all() -> HfDataset:
    dataset_list = []
    for subset_name in _multi_alpaca_language_list:
        dataset = get_multi_alpaca(subset_name)
        dataset_list.append(dataset)
    dataset = concatenate_datasets(dataset_list)
    return dataset


def get_code_alpaca_en_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'wyj123456/code_alpaca_en', split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


def get_instinwild_zh_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'wyj123456/instinwild', subset_name='default',
        split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


def get_instinwild_en_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'wyj123456/instinwild', subset_name='subset',
        split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


DATASET_MAPPING = {
    'alpaca-en': get_alpaca_gpt4_en_dataset,
    'alpaca-zh': get_alpaca_gpt4_zh_dataset,
    'finance-en': get_finance_en_dataset,
    'multi-alpaca-all': get_multi_alpaca_all,
    **{
        f'multi-alpaca-{k}': partial(get_multi_alpaca, k)
        for k in _multi_alpaca_language_list
    },
    'code-en': get_code_alpaca_en_dataset,
    'instinwild-zh': get_instinwild_zh_dataset,
    'instinwild-en': get_instinwild_en_dataset,
}


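# `partial` binds the subset name up front, so for example
# DATASET_MAPPING['multi-alpaca-de']() is equivalent to get_multi_alpaca('de').

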
def get_dataset(dataset_name_list: List[str]) -> HfDataset:
    dataset_list = []
    for dataset_name in dataset_name_list:
        get_function = DATASET_MAPPING[dataset_name]
        dataset_list.append(get_function())
    dataset = concatenate_datasets(dataset_list)
    return dataset


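# Example (illustrative; triggers a download from ModelScope on first use):
#   dataset = get_dataset(['alpaca-en', 'alpaca-zh'])

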
def process_dataset(dataset: HfDataset, dataset_test_size: float,
                    dataset_sample: int,
                    dataset_seed: int) -> Tuple[HfDataset, HfDataset]:
    random_state = np.random.RandomState(dataset_seed)
    # A negative `dataset_sample` keeps the full dataset; otherwise take a
    # random subsample of that size.
    if dataset_sample >= 0:
        index = random_state.permutation(len(dataset))[:dataset_sample]
        dataset = dataset.select(index)
    # Derive the train/test split seed from the same random state so the
    # whole pipeline is reproducible from `dataset_seed`.
    dataset = dataset.train_test_split(
        dataset_test_size, seed=get_seed(random_state))
    return dataset['train'], dataset['test']
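

# A minimal end-to-end sketch (illustrative only: the dataset choice, sample
# size, and split ratio below are arbitrary, and running it downloads data
# from ModelScope):
if __name__ == '__main__':
    dataset = get_dataset(['alpaca-en'])
    train_dataset, val_dataset = process_dataset(
        dataset, dataset_test_size=0.01, dataset_sample=20000,
        dataset_seed=42)
    print(train_dataset)
    print(val_dataset)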