from typing import List, Optional, Tuple, Union

import numpy as np
from datasets import Dataset as HfDataset
from datasets import concatenate_datasets
from numpy.random import RandomState

from modelscope import MsDataset


def _processing_alpaca(dataset: HfDataset) -> HfDataset:
    # Merge each sample's `input` into its `instruction`, keeping `output` as-is.
    instruction = dataset['instruction']
    input_ = dataset['input']
    res = []
    for inst, inp in zip(instruction, input_):
        if inp is not None and inp != '':
            # Strip the Chinese '输入:' ('Input:') prefix if present.
            if inp.startswith('输入:'):
                inp = inp[3:]
            inst = f'{inst}\n{inp}'
        res.append(inst)
    dataset = HfDataset.from_dict({
        'instruction': res,
        'output': dataset['output']
    })
    return dataset


def _processing_multi_alpaca(
        datasets: Union[HfDataset, List[HfDataset]]) -> HfDataset:
    # Like `_processing_alpaca`, but accepts one or more subsets and drops
    # samples whose `output` is empty.
    output = []
    res = []
    if not isinstance(datasets, list):
        datasets = [datasets]
    for dataset in datasets:
        instruction = dataset['instruction']
        input_ = dataset['input']
        output_ = dataset['output']
        for inst, inp, opt in zip(instruction, input_, output_):
            if inp is not None and inp != '':
                if inp.startswith('输入:'):
                    inp = inp[3:]
                inst = f'{inst}\n{inp}'
            if opt is not None and opt != '':
                res.append(inst)
                output.append(opt)
    dataset = HfDataset.from_dict({'instruction': res, 'output': output})
    return dataset


def get_alpaca_en_dataset() -> HfDataset:
    dataset_en: HfDataset = MsDataset.load(
        'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset()
    dataset_en = dataset_en.remove_columns(['text'])
    return _processing_alpaca(dataset_en)


def get_alpaca_zh_dataset() -> HfDataset:
    dataset_zh: HfDataset = MsDataset.load(
        'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset()
    return _processing_alpaca(dataset_zh)


def get_multi_alpaca_dataset() -> HfDataset:
    # Load every language subset of the multi-alpaca dataset and merge them.
    dataset_multi = []
    for subset_name in [
            'ar', 'de', 'es', 'fr', 'id', 'ja', 'ko', 'pt', 'ru', 'th', 'vi'
    ]:
        dataset_sub: HfDataset = MsDataset.load(
            'damo/nlp_polylm_multialpaca_sft',
            subset_name=subset_name,
            split='train').to_hf_dataset()
        dataset_multi.append(dataset_sub)
    return _processing_multi_alpaca(dataset_multi)


def get_seed(random_state: RandomState) -> int:
    # Draw a fresh int32 seed from the given random state.
    seed_max = np.iinfo(np.int32).max
    seed = random_state.randint(0, seed_max)
    return seed


def process_dataset(dataset: HfDataset, dataset_test_size: float,
                    dataset_sample: Optional[int],
                    dataset_seed: int) -> Tuple[HfDataset, HfDataset]:
    random_state = np.random.RandomState(dataset_seed)
    # Optionally subsample the dataset (without replacement) before splitting.
    if dataset_sample is not None:
        index = random_state.permutation(len(dataset))[:dataset_sample]
        dataset = dataset.select(index)
    dataset = dataset.train_test_split(
        dataset_test_size, seed=get_seed(random_state))
    return dataset['train'], dataset['test']


DATASET_MAPPER = {
    'alpaca-en': get_alpaca_en_dataset,
    'alpaca-zh': get_alpaca_zh_dataset,
    'alpaca-multi': get_multi_alpaca_dataset,
}


def get_dataset(dataset_name_list: List[str]) -> HfDataset:
    # Load every requested dataset by name and concatenate them into one.
    dataset_list = []
    for dataset_name in dataset_name_list:
        get_function = DATASET_MAPPER[dataset_name]
        dataset_list.append(get_function())
    dataset = concatenate_datasets(dataset_list)
    return dataset
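

# Usage sketch (illustrative, not part of the original module): build a
# combined dataset from the registered names and split it into train/test.
# The dataset names, sample size, and seed below are arbitrary examples.
if __name__ == '__main__':
    train_dataset, test_dataset = process_dataset(
        get_dataset(['alpaca-en', 'alpaca-zh']),
        dataset_test_size=0.01,
        dataset_sample=20000,
        dataset_seed=42)
    print(train_dataset)
    print(test_dataset)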