Merge branch 'debug_chatglm6b_json_dataset': fix conflict between hf dataset and to_hf_dataset

A dataset built from a file is already an hf dataset, so the function to_hf_dataset cannot be called on it.
This commit is contained in:
zsl01670416
2023-07-28 19:12:21 +08:00
committed by suluyan.sly
parent d5e45e3132
commit a3a6834f0d

View File

@@ -192,12 +192,12 @@ if args.dataset_json_file is None:
args.train_dataset_name,
subset_name=args.train_subset_name,
split=args.train_split,
namespace=args.train_dataset_namespace)
namespace=args.train_dataset_namespace).to_hf_dataset()
validation_dataset = MsDataset.load(
args.val_dataset_name,
subset_name=args.val_subset_name,
split=args.val_split,
namespace=args.val_dataset_namespace)
namespace=args.val_dataset_namespace).to_hf_dataset()
else:
train_dataset, validation_dataset = build_dataset_from_file(
args.dataset_json_file)
@@ -364,14 +364,14 @@ def preprocess_function_train(examples):
return model_inputs
train_dataset = train_dataset.to_hf_dataset().map(
train_dataset = train_dataset.map(
preprocess_function_train,
batched=True,
num_proc=args.preprocessing_num_workers,
desc='Running tokenizer on train dataset',
)
validation_dataset = validation_dataset.to_hf_dataset().map(
validation_dataset = validation_dataset.map(
preprocess_function_eval,
batched=True,
num_proc=args.preprocessing_num_workers,