From a3a6834f0ddfdf87a47fded09b51c0e653d0cd53 Mon Sep 17 00:00:00 2001 From: zsl01670416 Date: Fri, 28 Jul 2023 19:12:21 +0800 Subject: [PATCH] Merge branch 'debug_chatglm6b_json_dataset': fix conflict between HF datasets and to_hf_dataset. The dataset built from a file is already an HF dataset, so to_hf_dataset cannot be called on it. --- examples/pytorch/chatglm6b/finetune.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/chatglm6b/finetune.py b/examples/pytorch/chatglm6b/finetune.py index 0e31ce28..5e7ff6a5 100644 --- a/examples/pytorch/chatglm6b/finetune.py +++ b/examples/pytorch/chatglm6b/finetune.py @@ -192,12 +192,12 @@ if args.dataset_json_file is None: args.train_dataset_name, subset_name=args.train_subset_name, split=args.train_split, - namespace=args.train_dataset_namespace) + namespace=args.train_dataset_namespace).to_hf_dataset() validation_dataset = MsDataset.load( args.val_dataset_name, subset_name=args.val_subset_name, split=args.val_split, - namespace=args.val_dataset_namespace) + namespace=args.val_dataset_namespace).to_hf_dataset() else: train_dataset, validation_dataset = build_dataset_from_file( args.dataset_json_file) @@ -364,14 +364,14 @@ def preprocess_function_train(examples): return model_inputs -train_dataset = train_dataset.to_hf_dataset().map( +train_dataset = train_dataset.map( preprocess_function_train, batched=True, num_proc=args.preprocessing_num_workers, desc='Running tokenizer on train dataset', ) -validation_dataset = validation_dataset.to_hf_dataset().map( +validation_dataset = validation_dataset.map( preprocess_function_eval, batched=True, num_proc=args.preprocessing_num_workers,