mirror of
https://github.com/modelscope/modelscope.git
synced 2026-02-25 04:30:48 +01:00
Merge branch 'debug_chatglm6b_json_dataset': fix the conflict between HF datasets and to_hf_dataset()
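A dataset built from a JSON file is already a Hugging Face dataset, so to_hf_dataset() cannot be called on it. The conversion is therefore applied only to the results of MsDataset.load(...), and the later .map(...) calls operate on the datasets directly.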
This commit is contained in:
@@ -192,12 +192,12 @@ if args.dataset_json_file is None:
|
||||
args.train_dataset_name,
|
||||
subset_name=args.train_subset_name,
|
||||
split=args.train_split,
|
||||
namespace=args.train_dataset_namespace)
|
||||
namespace=args.train_dataset_namespace).to_hf_dataset()
|
||||
validation_dataset = MsDataset.load(
|
||||
args.val_dataset_name,
|
||||
subset_name=args.val_subset_name,
|
||||
split=args.val_split,
|
||||
namespace=args.val_dataset_namespace)
|
||||
namespace=args.val_dataset_namespace).to_hf_dataset()
|
||||
else:
|
||||
train_dataset, validation_dataset = build_dataset_from_file(
|
||||
args.dataset_json_file)
|
||||
@@ -364,14 +364,14 @@ def preprocess_function_train(examples):
|
||||
return model_inputs
|
||||
|
||||
|
||||
train_dataset = train_dataset.to_hf_dataset().map(
|
||||
train_dataset = train_dataset.map(
|
||||
preprocess_function_train,
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
desc='Running tokenizer on train dataset',
|
||||
)
|
||||
|
||||
validation_dataset = validation_dataset.to_hf_dataset().map(
|
||||
validation_dataset = validation_dataset.map(
|
||||
preprocess_function_eval,
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
|
||||
Reference in New Issue
Block a user