Closed RainBowLuoCS closed 3 months ago
What can I say — the playground folder is also missing.
Thank you for reaching out! I'd be happy to assist you with the ScienceQA dataset. To better understand the issue you're experiencing, could you please provide more details about the question you're trying to evaluate and the error information you're encountering?
Additionally, I've taken note of the evaluation code and necessary files available at https://huggingface.co/datasets/yifanzhang114/SMR/blob/main/eval.zip.
Thank you for your reply. I would like to know how to convert the downloaded ScienceQA image paths from the scienceqa/train/id/image.png format into the scienceqa/id.jpg format used by the SMR file. I noticed that the ids in SMR do not correspond to the ids in problems.json, so the paths cannot be produced by a simple id conversion.
I need your help — a quick pointer here, please!
Here is the code we use to process ScienceQA. Just replace image.png with id.jpg.
import os
import json
import random
from datasets import load_dataset
# Training split of ScienceQA from the Hugging Face hub.
dataset = load_dataset("derek-thomas/ScienceQA")['train']

# Map choice index -> option label, e.g. 0 -> "(A)", 1 -> "(B)", ... 9 -> "(J)".
idx_to_labels = {i: f"({chr(ord('A') + i)})" for i in range(10)}

# Accumulates the converted samples; dumped to JSON at the end of the script.
datas = []
def get_question_text(problem):
    """Return the raw question string of a ScienceQA record."""
    return problem['question']
def get_context_text(problem, use_caption):
    """Return the record's hint as context, or "N/A" when the hint is blank.

    `use_caption` is accepted for signature compatibility with the original
    ScienceQA tooling but is not consulted here.
    """
    stripped = problem['hint'].strip()
    return stripped if stripped else "N/A"
def get_choice_text(problem):
    """Format the answer choices as one string: "(A) foo (B) bar ...".

    NOTE: the parameter was originally misspelled "probelm"; the only call
    site in this script passes it positionally, so the rename is safe.
    """
    labeled = [f"({chr(ord('A') + i)}) {c}" for i, c in enumerate(problem['choices'])]
    return " ".join(labeled)
def get_answer(problem, options):
    """Look up the option string selected by the record's answer index."""
    answer_idx = problem['answer']
    return options[answer_idx]
def get_lecture_text(problem):
    """Return the lecture with real newlines escaped to literal "\\n".

    The escape keeps the lecture on one line (the original comment notes
    GPT-3 can generate the lecture with more tokens this way).
    """
    return problem['lecture'].replace("\n", "\\n")
def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Build an (input, output) prompt pair from ScienceQA fields.

    `format` is "<input_code>-<output_code>", e.g. "CQM-EA", where the
    codes compose Q(uestion), C(ontext), M (options), L(ecture) and
    E (solution/explanation) in the listed order.

    When `test_example` is True the output is just the "Answer:" cue;
    otherwise the output code selects how answer/lecture/solution are
    combined.  Raises ValueError on an unrecognized format code (the
    original fell through to a NameError on an unbound local instead).
    """
    input_format, output_format = format.split("-")

    # --- Inputs ---------------------------------------------------------
    input_templates = {
        "CQM":   f"Context: {context}\n{question}\nOptions: {choice}\n",
        "QCM":   f"{question}\nContext: {context}\nOptions: {choice}\n",
        # upper bound experiments
        "QCML":  f"{question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n",
        "QCME":  f"{question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n",
        "QCMLE": f"{question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n",
        "QCLM":  f"{question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n",
        "QCEM":  f"{question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n",
        "QCLEM": f"{question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n",
    }
    try:
        prompt = input_templates[input_format]
    except KeyError:
        raise ValueError(f"unknown input format: {input_format!r}") from None

    # --- Outputs --------------------------------------------------------
    # NOTE(review): 'AL' uses solution and 'AE' uses lecture, which looks
    # swapped relative to the mnemonic; preserved as-is to keep behavior.
    if test_example:
        target = "Answer:"
    else:
        output_templates = {
            'A':   f"The answer is {answer}.",
            'AL':  f"The answer is {answer}. BECAUSE: {solution}",
            'AE':  f"The answer is {answer}. BECAUSE: {lecture}",
            'ALE': f"The answer is {answer}. BECAUSE: {lecture} {solution}",
            'AEL': f"The answer is {answer}. BECAUSE: {solution} {lecture}",
            'LA':  f"{lecture} The answer is {answer}.",
            'EA':  f"{solution} The answer is {answer}.",
            'LEA': f"{lecture} {solution} The answer is {answer}.",
            'ELA': f"{solution} {lecture} The answer is {answer}.",
        }
        try:
            target = output_templates[output_format]
        except KeyError:
            raise ValueError(f"unknown output format: {output_format!r}") from None

    # Collapse double spaces and trim.  The posted code had this mangled
    # into a no-op replace(" ", " ") by markdown rendering; the intended
    # upstream form is replace("  ", " ").
    prompt = prompt.replace("  ", " ").strip()
    target = target.replace("  ", " ").strip()
    # Drop a dangling "BECAUSE:" left behind by an empty lecture/solution.
    if target.endswith("BECAUSE:"):
        target = target.replace("BECAUSE:", "").strip()
    return prompt, target
def get_solution_text(problem):
    """Return the solution with real newlines escaped to literal "\\n".

    The escape keeps the solution on one line (the original comment notes
    GPT-3 can generate the solution with more tokens this way).
    """
    return problem['solution'].replace("\n", "\\n")
# Convert each training record into an SMR-style preference sample.
for idx, line in enumerate(dataset):
    image = line['image']

    if 'choices' not in line:
        # BUG FIX: the original left `choices`/`rejected` unbound on this
        # path and crashed with a NameError further down.  ScienceQA rows
        # always carry 'choices', so skip any malformed record instead.
        print(f"skipping record {idx}: no 'choices' field")
        continue

    choices = line['choices']
    # Pick a random wrong option as the "rejected" response.
    # NOTE(review): with a single choice, `neg` stays equal to the answer
    # and rejected == chosen — preserved from the original.
    random_choices = random.sample(range(len(choices)), k=len(choices))
    for neg in random_choices:
        if neg != line['answer']:
            break
    chosen = choices[line['answer']]
    rejected = choices[neg]

    question = get_question_text(line)
    context = get_context_text(line, True)
    choice = get_choice_text(line)
    answer = get_answer(line, choices)
    lecture = get_lecture_text(line)
    solution = get_solution_text(line)
    # "CQM-EA": context+question+options as input, solution + answer as output.
    input, output = create_one_example('CQM-EA',
                                       question,
                                       context,
                                       choice,
                                       answer,
                                       lecture,
                                       solution,
                                       test_example=False)

    # Images are saved under the enumeration index -- NOT the ScienceQA
    # problem id -- which is why ids in SMR don't match problems.json.
    output_path = os.path.join('/mnt/workspace/xue.w/yf/data/scienceqa', f'{str(idx)}.jpg')
    if image is not None and not os.path.exists(output_path):
        image.save(output_path)
    image_path = os.path.join('scienceqa', f'{str(idx)}.jpg')

    data = {
        'id': idx,
        'dataset': 'ScienceQA',
        'conversations': [
            {'from': 'human', 'value': f"<image>\n{input}"},
            {'from': 'gpt', 'value': output}
        ],
        'output_1': {'from': 'llava', 'value': chosen},
        'output_2': {'from': 'llava', 'value': rejected},
        'preference': 1,
    }
    if image is None:
        # Text-only sample: keep it, just without an 'image' entry.
        datas.append(data)
    elif os.path.exists(output_path):
        data['image'] = image_path
        datas.append(data)
    else:
        print(f"文件 {image_path} 不存在.")

output_file_path = 'playground/ScienceQA.json'
# Save the collected samples to a JSON file.
with open(output_file_path, 'w') as json_file:
    json.dump(datas, json_file)
Great work!
We can't use your data file for training directly — how do we transform ScienceQA into the target format?