yfzhang114 / SliME

✨✨Beyond LLaVA-HD: Diving into High-Resolution Large Multimodal Models
Apache License 2.0
139 stars 7 forks source link

ScienceQA Format Error! #6

Closed RainBowLuoCS closed 3 months ago

RainBowLuoCS commented 4 months ago

We can't use your data file for training. How can we transform the ScienceQA data into the target format?

RainBowLuoCS commented 4 months ago

what can i say, playground folder is also lost.

yfzhang114 commented 4 months ago

Thank you for reaching out! I'd be happy to assist you with the ScienceQA dataset. To better understand the issue you're experiencing, could you please provide more details about the question you're trying to evaluate and the error information you're encountering?

Additionally, the evaluation code and the necessary files are available at https://huggingface.co/datasets/yifanzhang114/SMR/blob/main/eval.zip.

RainBowLuoCS commented 4 months ago

Thank you for your reply. I want to know how to convert the downloaded ScienceQA image paths from the sciencqa/train/id/image.png format into the sciencqa/id.jpg format used in the SMR file. I noticed that the id in the SMR file is not the corresponding ID in problems.json, so it cannot be converted to the target format through a simple ID mapping.

RainBowLuoCS commented 4 months ago

I need your help, Bro. A quick citation here !.

yfzhang114 commented 4 months ago

Here is the code we use to process ScienceQA. Just replace image.png with id.jpg.

import os
import json
import random
from datasets import load_dataset

# Load the ScienceQA training split from the Hugging Face Hub.
dataset = load_dataset("derek-thomas/ScienceQA")['train']

# Map a choice index to its letter label, e.g. 0 -> "(A)", 1 -> "(B)".
# NOTE(review): only referenced from commented-out code further below.
idx_to_labels = dict()
for i in range(10):
    idx_to_labels[i] = f"({chr(ord('A') + i)})"
datas = []  # accumulates one record per dataset row; dumped to JSON at the end

def get_question_text(problem):
    """Return the raw question string of a ScienceQA record."""
    return problem['question']

def get_context_text(problem, use_caption):
    """Return the stripped hint text, or "N/A" when the hint is empty.

    ``use_caption`` is accepted for interface compatibility but not used.
    """
    return problem['hint'].strip() or "N/A"

def get_choice_text(probelm):
    """Format the answer options as one line: "(A) foo (B) bar ..."."""
    labelled = [
        f"({chr(ord('A') + idx)}) {option}"
        for idx, option in enumerate(probelm['choices'])
    ]
    return " ".join(labelled)

def get_answer(problem, options):
    """Look up the gold answer text via the record's integer answer index."""
    answer_index = problem['answer']
    return options[answer_index]

def get_lecture_text(problem):
    """Return the lecture with real newlines escaped to a literal backslash-n."""
    # Escaping keeps the lecture on a single line in the emitted JSON text.
    return problem['lecture'].replace("\n", "\\n")

def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Render one ScienceQA example as a (prompt, target) text pair.

    Args:
        format: "<input>-<output>" spec, e.g. "CQM-EA". The input part
            orders Context/Question/Options (optionally leaking lecture or
            solution for upper-bound experiments); the output part orders
            answer/lecture/solution in the target.
            (Name shadows the builtin but is kept for interface compatibility.)
        question, context, choice, answer, lecture, solution: pre-formatted
            text fields of the example.
        test_example: when True the target is just "Answer:" (inference
            prompt); when False the full gold target is rendered.

    Returns:
        (prompt, target) strings, double spaces collapsed once and stripped;
        a target that ends with a dangling "BECAUSE:" has it removed.

    Raises:
        ValueError: if the input or output format code is unknown (the
            original fell through every branch and crashed with an
            unrelated error on the builtin ``input``).
    """
    input_format, output_format = format.split("-")

    # All prompt layouts, keyed by input-format code.
    prompt_templates = {
        "CQM": f"Context: {context}\n{question}\nOptions: {choice}\n",
        "QCM": f"{question}\nContext: {context}\nOptions: {choice}\n",
        # Upper-bound experiments: leak lecture/solution into the prompt.
        "QCML": f"{question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n",
        "QCME": f"{question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n",
        "QCMLE": f"{question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n",
        "QCLM": f"{question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n",
        "QCEM": f"{question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n",
        "QCLEM": f"{question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n",
    }
    try:
        prompt = prompt_templates[input_format]
    except KeyError:
        raise ValueError(f"unknown input format: {input_format!r}") from None

    if test_example:
        prompt_target = "Answer:"
    else:
        # All target layouts, keyed by output-format code.
        # NOTE(review): 'AL' uses solution and 'AE' uses lecture — kept
        # exactly as in the original; confirm the swap is intentional.
        target_templates = {
            "A": f"The answer is {answer}.",
            "AL": f"The answer is {answer}. BECAUSE: {solution}",
            "AE": f"The answer is {answer}. BECAUSE: {lecture}",
            "ALE": f"The answer is {answer}. BECAUSE: {lecture} {solution}",
            "AEL": f"The answer is {answer}. BECAUSE: {solution} {lecture}",
            "LA": f"{lecture} The answer is {answer}.",
            "EA": f"{solution} The answer is {answer}.",
            "LEA": f"{lecture} {solution} The answer is {answer}.",
            "ELA": f"{solution} {lecture} The answer is {answer}.",
        }
        try:
            prompt_target = target_templates[output_format]
        except KeyError:
            raise ValueError(f"unknown output format: {output_format!r}") from None

    prompt = prompt.replace("  ", " ").strip()
    prompt_target = prompt_target.replace("  ", " ").strip()
    # Drop a trailing "BECAUSE:" left over when lecture/solution is empty
    # (replace-all preserved from the original implementation).
    if prompt_target.endswith("BECAUSE:"):
        prompt_target = prompt_target.replace("BECAUSE:", "").strip()
    return prompt, prompt_target

def get_solution_text(problem):
    """Return the solution with real newlines escaped to a literal backslash-n."""
    # Escaping keeps the solution on a single line in the emitted JSON text.
    return problem['solution'].replace("\n", "\\n")

# Convert each ScienceQA row into a SliME/SMR-style preference record.
for idx, line in enumerate(dataset):
    # print(line)
    image = line['image']  # PIL image or None for text-only questions
    # if image is None:
    #     continue
    choices_str = ''
    if 'choices' in line:
        choices = line['choices']
        # Shuffle the choice indices, then take the first index that is not
        # the gold answer as the "rejected" option.
        random_choices = random.sample(range(len(choices)), k=len(choices))

        for neg in random_choices:
            if neg != line['answer']:
                break
        # NOTE(review): with a single choice, `neg` ends up equal to the
        # answer index, so chosen == rejected — confirm this cannot happen.
        # choices_str = '\n'.join([f'({chr(ord("A") + i)}) {choice}' for i, choice in enumerate(choices)])
        # chosen = idx_to_labels[line['answer']]
        # rejected = idx_to_labels[neg]
        chosen = choices[line['answer']]
        rejected = choices[neg]
    else:
        # NOTE(review): `rejected` is never assigned on this path and
        # `choices` (used below) would be stale or undefined — this raises
        # if any row lacks 'choices'. Presumably every ScienceQA row has
        # choices; verify against the dataset schema.
        chosen = line['answer']

    question = get_question_text(line)
    context = get_context_text(line, True)
    choice = get_choice_text(line)
    answer = get_answer(line, choices)
    lecture = get_lecture_text(line)
    solution = get_solution_text(line)

    # 'CQM-EA': prompt is Context+Question+Options; target is
    # "<solution> The answer is <answer>."
    input, output = create_one_example('CQM-EA',
                                        question,
                                        context,
                                        choice,
                                        answer,
                                        lecture,
                                        solution,
                                        test_example=False)
    # Images are saved under the RUNNING index, not the original problem id
    # (this answers the issue: SMR ids do not match problems.json ids).
    output_path = os.path.join('/mnt/workspace/xue.w/yf/data/scienceqa', f'{str(idx)}.jpg')
    if image is not None and not os.path.exists(output_path):
        image.save(output_path)
    image_path = os.path.join('scienceqa', f'{str(idx)}.jpg')

    data = {
            'id': idx,
            'dataset' : 'ScienceQA',
            # 'image': image_path,  
            # 'conversations': [
            #     {'from': 'human', 'value': f"<image>\n{line['question']}"},
            #     {'from': 'gpt', 'value': chosen}
            # ],
            'conversations': [
                {'from': 'human', 'value': f"<image>\n{input}"},
                {'from': 'gpt', 'value': output}
            ],
            'output_1': {'from': 'llava', 'value': chosen},
            'output_2': {'from': 'llava', 'value': rejected},
            'preference': 1,
        }

    # Keep text-only rows always; image rows only when the image file exists.
    if image is None:
        datas.append(data)
    elif os.path.exists(output_path):
        data['image'] = image_path
        datas.append(data)
    else:
        # Runtime string kept verbatim ("file {image_path} does not exist").
        print(f"文件 {image_path} 不存在.")

# Destination for the converted annotation file.
output_file_path = 'playground/ScienceQA.json'

# Save the collected records to a JSON file.
with open(output_file_path, 'w') as json_file:
    json.dump(datas, json_file)
RainBowLuoCS commented 4 months ago

great work!