diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index ad900bab..e90f397b 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -563,6 +563,18 @@ class MsDataset:
         self._hf_ds.reset_format()
         return self._hf_ds
 
+    def remap_columns(self, column_mapping: Dict[str, str]) -> Dataset:
+        """
+        Rename columns and return the underlying hf dataset directly
+        TODO: support native MsDataset column rename.
+        Args:
+            column_mapping: the mapping of the original and new column names
+        Returns:
+            underlying hf dataset
+        """
+        self._hf_ds.reset_format()
+        return self._hf_ds.rename_columns(column_mapping)
+
     @staticmethod
     def upload(object_name: str,
                local_file_path: str,
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 72196fba..4972a731 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -24,17 +24,16 @@ class TestFinetuneMPlug(unittest.TestCase):
         datadict = MsDataset.load(
             'coco_captions_small_slice',
             download_mode=DownloadMode.FORCE_REDOWNLOAD)
-        self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map(
-            lambda _: {
-                'question': 'what the picture describes?'
-            }).rename_column('image:FILE',
-                             'image').rename_column('answer:Value', 'answer'))
-        self.test_dataset = MsDataset(datadict['test'].to_hf_dataset().map(
-            lambda _: {
-                'question': 'what the picture describes?'
-            }).rename_column('image:FILE',
-                             'image').rename_column('answer:Value', 'answer'))
-
+        self.train_dataset = MsDataset(
+            datadict['train'].remap_columns({
+                'image:FILE': 'image',
+                'answer:Value': 'answer'
+            }).map(lambda _: {'question': 'what the picture describes?'}))
+        self.test_dataset = MsDataset(
+            datadict['test'].remap_columns({
+                'image:FILE': 'image',
+                'answer:Value': 'answer'
+            }).map(lambda _: {'question': 'what the picture describes?'}))
         self.max_epochs = 2
 
     def tearDown(self):
diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py
index 63d4577b..59bef51c 100644
--- a/tests/trainers/test_finetune_text_generation.py
+++ b/tests/trainers/test_finetune_text_generation.py
@@ -130,10 +130,16 @@ class TestFinetuneTextGeneration(unittest.TestCase):
     def test_finetune_cnndm(self):
         from modelscope.msdatasets import MsDataset
         dataset_dict = MsDataset.load('DuReader_robust-QG')
-        train_dataset = dataset_dict['train'].to_hf_dataset() \
-            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
-        eval_dataset = dataset_dict['validation'].to_hf_dataset() \
-            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
+        train_dataset = dataset_dict['train'].remap_columns({
+            'text1': 'src_txt',
+            'text2': 'tgt_txt'
+        })
+        eval_dataset = dataset_dict['validation'].remap_columns({
+            'text1':
+            'src_txt',
+            'text2':
+            'tgt_txt'
+        })
         num_warmup_steps = 200
         os.environ['LOCAL_RANK'] = '0'
 