Please provide a clear and concise description of what the question is.
感谢大佬在 convert_dataset.py 中更新了对 csv 文件的转换支持。不过我还是希望能对一个文件夹里的 json、jsonl 文件进行批量转换,下面是我根据之前版本改动的代码。如果可以,希望大佬也能完善一下批量处理 csv 文件的脚本!
import os
import argparse
from datasets import load_dataset
def process_alpaca(examples):
    """Convert a batch of Alpaca-style records into ShareGPT conversations.

    Args:
        examples: Batch mapping with parallel ``instruction``, ``input`` and
            ``output`` lists (the columnar shape passed by a batched
            ``Dataset.map`` call).

    Returns:
        ``{"conversations": [...]}`` with one entry per record. Each entry is
        a two-turn list: a ``human`` turn holding the prompt and a ``gpt``
        turn holding the output.
    """
    convs = []
    for instruction, inp, output in zip(
        examples['instruction'], examples['input'], examples['output']
    ):
        # A record without an input may surface as None rather than "";
        # normalise it so the strip() below cannot raise.
        inp = inp or ''
        # Inputs with at most one non-blank character are treated as absent
        # (threshold kept from the original implementation).
        if len(inp.strip()) > 1:
            instruction = instruction + '\n\n' + inp
        convs.append([
            {"from": "human", "value": instruction},
            {"from": "gpt", "value": output},
        ])
    return {"conversations": convs}
def convert_files_in_folder(input_folder, output_folder, data_type='alpaca'):
    """Convert every JSON/JSONL file in a folder to ShareGPT-style JSON lines.

    Each regular file ``<name>`` directly inside ``input_folder`` whose
    extension is ``json`` or ``jsonl`` is loaded, converted, and written to
    ``<output_folder>/sharegpt_<name>`` as JSON lines.

    Args:
        input_folder: Directory scanned (non-recursively) for input files.
        output_folder: Directory receiving the converted files; created if it
            does not exist yet.
        data_type: ``'alpaca'`` converts records with ``process_alpaca``. Any
            other value treats the file as already conversation-shaped: an
            ``items`` column is renamed to ``conversations`` and every other
            column is dropped.

    Raises:
        ValueError: If a non-alpaca file has neither a ``conversations`` nor
            an ``items`` column.
    """
    # Collect the JSON/JSONL files; sorted so the processing order is stable.
    input_files = sorted(
        os.path.join(input_folder, name)
        for name in os.listdir(input_folder)
        if name.split('.')[-1].lower() in ('json', 'jsonl')
        and os.path.isfile(os.path.join(input_folder, name))
    )

    # Writing the output fails if the target directory is missing.
    os.makedirs(output_folder, exist_ok=True)

    for input_file in input_files:
        output_file = os.path.join(
            output_folder, f"sharegpt_{os.path.basename(input_file)}"
        )
        # Load the current file as a single "train" split.
        raw_datasets = load_dataset('json', data_files={"train": input_file})
        ds = raw_datasets['train']

        if data_type in ['alpaca']:
            ds = ds.map(
                process_alpaca,
                batched=True,
                remove_columns=ds.column_names,
                desc="Running process",
            )
        else:
            # Other sharegpt-style dataset: rename to "conversations" and
            # remove the unused columns.
            if "items" in ds.column_names:
                # Dataset has no rename(columns=...) method; rename_column
                # is the single-column rename API.
                ds = ds.rename_column("items", "conversations")
            if 'conversations' not in ds.column_names:
                raise ValueError(
                    f"{input_file}: expected a 'conversations' or 'items' "
                    f"column, found {ds.column_names}"
                )
            columns_to_remove = [
                column for column in ds.column_names if column != 'conversations'
            ]
            ds = ds.remove_columns(columns_to_remove)

        # Save the processed data to the output file.
        ds.to_json(output_file, lines=True, force_ascii=False)
if __name__ == "__main__":
    # Command-line entry point: parse the folder paths and dataset type,
    # then run the batch conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_folder", type=str, required=True,
                        help="Folder containing the json/jsonl files to convert")
    parser.add_argument("--output_folder", type=str, required=True,
                        help="Folder that receives the converted files")
    parser.add_argument("--data_type", type=str, default='alpaca',
                        help="'alpaca' or any other value for sharegpt-style data")
    args = parser.parse_args()
    print(args)
    # The pasted script stopped after printing the arguments and never ran
    # the conversion.
    convert_files_in_folder(
        args.input_folder, args.output_folder, data_type=args.data_type
    )
Describe the Question
Please provide a clear and concise description of what the question is.
感谢大佬在 convert_dataset.py 中更新了对 csv 文件的转换支持。不过我还是希望能对一个文件夹里的 json、jsonl 文件进行批量转换,下面是我根据之前版本改动的代码。如果可以,希望大佬也能完善一下批量处理 csv 文件的脚本!