# modelscope/examples/pytorch/llm/utils/dataset.py
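"""Dataset loading and preprocessing helpers for the LLM fine-tuning example.

Each ``get_*_dataset`` helper pulls a dataset from the ModelScope hub,
converts it to a HuggingFace ``Dataset`` and normalises it into two columns,
``instruction`` and ``output``, ready for supervised fine-tuning.
"""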
from functools import partial
from typing import Callable, List, Optional, Tuple

import numpy as np
from datasets import Dataset as HfDataset
from datasets import concatenate_datasets
from swift.utils import get_seed
from modelscope import MsDataset
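

# Merge each sample's `instruction` and `input` fields into a single
# `instruction` string (optionally running `preprocess_input` on the input
# first), keeping only the `instruction` and `output` columns.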
def _processing_alpaca(
        dataset: HfDataset,
        preprocess_input: Optional[Callable[[str], str]] = None) -> HfDataset:
    instruction = dataset['instruction']
    input_ = dataset['input']
    new_instruction = []
    for inst, inp in zip(instruction, input_):
        if inp is None:
            inp = ''
        if preprocess_input is not None:
            inp = preprocess_input(inp)
        inst = f'{inst}\n{inp}'
        new_instruction.append(inst)
    dataset = HfDataset.from_dict({
        'instruction': new_instruction,
        'output': dataset['output']
    })
    return dataset
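

# Single-dataset loaders: each pulls a train split from the ModelScope hub,
# converts it to a HuggingFace Dataset and normalises it to the alpaca schema.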
def get_alpaca_gpt4_en_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


def get_alpaca_gpt4_zh_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset()

    def _preprocess_input(inp: str) -> str:
        # Some zh samples prefix the input with '输入:' ("Input:"); strip it.
        if inp.startswith('输入:'):
            inp = inp[3:]
        return inp

    return _processing_alpaca(dataset, _preprocess_input)


def get_finance_en_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'wyj123456/finance_en', split='train').to_hf_dataset()
    return _processing_alpaca(dataset)
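

# Language subsets available in the damo/nlp_polylm_multialpaca_sft dataset.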
_multi_alpaca_language_list = [
    'ar', 'de', 'es', 'fr', 'id', 'ja', 'ko', 'pt', 'ru', 'th', 'vi'
]


def get_multi_alpaca(subset_name: str) -> HfDataset:
    """
    subset_name:
        Language-key  Language    #Examples
        ar            Arabic      14,671
        de            German      9,515
        es            Spanish     9,958
        fr            French      11,332
        id            Indonesian  12,117
        ja            Japanese    10,191
        ko            Korean      14,402
        pt            Portuguese  10,825
        ru            Russian     14,286
        th            Thai        11,496
        vi            Vietnamese  13,908
    """
    dataset: HfDataset = MsDataset.load(
        'damo/nlp_polylm_multialpaca_sft',
        subset_name=subset_name,
        split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


def get_multi_alpaca_all() -> HfDataset:
    dataset_list = []
    for subset_name in _multi_alpaca_language_list:
        dataset = get_multi_alpaca(subset_name)
        dataset_list.append(dataset)
    dataset = concatenate_datasets(dataset_list)
    return dataset


def get_code_alpaca_en_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'wyj123456/code_alpaca_en', split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


def get_instinwild_zh_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'wyj123456/instinwild', subset_name='default',
        split='train').to_hf_dataset()
    return _processing_alpaca(dataset)


def get_instinwild_en_dataset() -> HfDataset:
    dataset: HfDataset = MsDataset.load(
        'wyj123456/instinwild', subset_name='subset',
        split='train').to_hf_dataset()
    return _processing_alpaca(dataset)
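

# Registry mapping a dataset name to its loader function; the per-language
# multi-alpaca entries are built with functools.partial so the language
# subset is bound at registration time.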
DATASET_MAPPING = {
    'alpaca-en': get_alpaca_gpt4_en_dataset,
    'alpaca-zh': get_alpaca_gpt4_zh_dataset,
    'finance-en': get_finance_en_dataset,
    'multi-alpaca-all': get_multi_alpaca_all,
    **{
        f'multi-alpaca-{k}': partial(get_multi_alpaca, k)
        for k in _multi_alpaca_language_list
    },
    'code-en': get_code_alpaca_en_dataset,
    'instinwild-zh': get_instinwild_zh_dataset,
    'instinwild-en': get_instinwild_en_dataset,
}
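

# Look up each requested dataset in DATASET_MAPPING, load it and concatenate
# the results into a single Dataset.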
def get_dataset(dataset_name_list: List[str]) -> HfDataset:
    dataset_list = []
    for dataset_name in dataset_name_list:
        get_function = DATASET_MAPPING[dataset_name]
        dataset_list.append(get_function())
    dataset = concatenate_datasets(dataset_list)
    return dataset
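

# Optionally subsample the dataset (a negative dataset_sample keeps every
# example), then split it into train/test parts; both the sampling and the
# split are driven by dataset_seed for reproducibility.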
def process_dataset(dataset: HfDataset, dataset_test_size: float,
                    dataset_sample: int,
                    dataset_seed: int) -> Tuple[HfDataset, HfDataset]:
    random_state = np.random.RandomState(dataset_seed)
    if dataset_sample >= 0:
        index = random_state.permutation(len(dataset))[:dataset_sample]
        dataset = dataset.select(index)
    dataset = dataset.train_test_split(
        dataset_test_size, seed=get_seed(random_state))
    return dataset['train'], dataset['test']
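

# Minimal usage sketch (not part of the original module): the dataset names,
# sample size and seed below are illustrative values, not prescriptive ones.
if __name__ == '__main__':
    raw_dataset = get_dataset(['alpaca-en', 'alpaca-zh'])
    train_dataset, val_dataset = process_dataset(
        raw_dataset, dataset_test_size=0.01, dataset_sample=20000,
        dataset_seed=42)
    print(train_dataset)
    print(val_dataset)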