biodatlab / ocr-skooldio

OCR Notebooks and Lectures for Skooldio
MIT License
0 stars 3 forks source link

Create notebooks for use in the class #1

Open titipata opened 2 months ago

titipata commented 2 months ago

Here are some notebooks that we can prepare during the day.

titipata commented 1 month ago

ChatGPT 4o for information extraction

import os
import base64
import requests
from openai import OpenAI

# Set up the OpenAI API client.
# The key is read from the OPENAI_API_KEY environment variable so it is never
# pasted into (and shared with) the notebook. When the variable is unset this
# falls back to the original empty placeholder.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

def ask_question_about_image(image_path, question, model="gpt-4o", max_tokens=300):
    """Ask a question about an image using an OpenAI chat-completions model.

    The image is base64-encoded and sent inline as a data URL, together with
    the question, in a single user message.

    Args:
        image_path: Path to the image file on disk.
        question: Prompt text sent alongside the image.
        model: Name of the chat-completions model to call. Defaults to
            "gpt-4o", the value that was previously hard-coded.
        max_tokens: Value passed as the request's ``max_tokens``. Defaults to
            300, the value that was previously hard-coded; pass a larger
            value when a long answer is expected.

    Returns:
        The message content of the first choice in the API response.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import mimetypes

    # Encode the image
    base64_image = encode_image(image_path)

    # Label the data URL with the file's real media type (e.g. image/png)
    # instead of always claiming JPEG; fall back to JPEG when the extension
    # is unknown or does not map to an image type.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Prepare the payload for the API request
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": question,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        "max_tokens": max_tokens,
    }

    # Make the API request
    response = client.chat.completions.create(**payload)

    # Extract and return the model's answer
    return response.choices[0].message.content
# Path of the sample image that is sent to the model.
image_path = "ca_รายการจดทะเบียน_3.jpg"
# Extraction prompt: lists 25 fields (Thai label followed by the English JSON
# key in parentheses) and instructs the model to return JSON only, using
# "N/A" for any field that is not visible in the image.
question = """You are provided with a scanned or photographed image of a Thai vehicle registration book (สมุดทะเบียนรถ). Your task is to extract the following information from the image.
The extracted value is typically located on the right side of the key in the document.
Extract these details:

1. วันจดทะเบียน (date_of_registration)
2. เลขทะเบียน (registration_no)
3. จังหวัด (car_province)
4. ประเภท (vehicle_use)
5. รย (type)
6. ลักษณะ (body_style)
7. ยี่ห้อรถ (manufacturer)
8. แบบ (model)
9. รุ่นปี คศ (year)
10. สี (color)
11. เลขตัวรถ (chassis_number)
12. อยู่ที่ (chassis_location)
13. ยี่ห้อเครื่องยนต์ (engine_manufacturer)
14. เลขเครื่องยนต์ (engine_number)
15. อยู่ที่ (engine_location)
16. เชื้อเพลิง (fuel_type)
17. เลขถังแก๊ส (fuel_tank_number)
18. จำนวน (cylinders)
19. ซีซี (cubic_capacity)
20. แรงม้า (horse_power)
21. จำนวนเพลาและล้อ (axles_wheels_no)
22. น้ำหนักรถ (unladen_weight)
23. น้ำหนักบรรทุก/น้ำหนักเพลา (load_capacity)
24. น้ำหนักรวม (gross_weight)
25. ที่นั่ง (seats)

Instructions:

Carefully examine the image and locate each piece of information.
If a particular field is not visible or not present in the image, use the value "N/A" for that field.
Ensure all text extracted from the image is in its original language (Thai or English) as it appears in the document.
Return the extracted information in a JSON format, using the English key names provided in parentheses.
Only return the JSON output, without any additional explanation or text.

Example of expected output format:
{
  "date_of_registration": "1 ม.ค. 2566",
  "registration_no": "กข 1234",
  "car_province": "กรุงเทพมหานคร",
  ...
  "seats": "4"
}
"""

# Send the image and the prompt to the model, then print the prompt followed
# by the model's reply.
answer = ask_question_about_image(image_path, question)
print(f"Question: {question}")
print(f"Answer: {answer}")
titipata commented 3 weeks ago

Evaluation

import numpy as np
import pandas as pd
import jiwer

# Load the ground-truth annotations and the model predictions. Every cell is
# read as a string and missing cells become "".
annotated_df = pd.read_excel("annotated_results.xlsx", dtype=str).fillna("")
predicted_df = pd.read_excel("predicted_results.xlsx", dtype=str).fillna("")

# Join both tables on image_path. Columns present in both tables get the
# suffix _df1 (annotated) or _df2 (predicted).
merged_df = annotated_df.merge(
    predicted_df,
    on="image_path",
    suffixes=("_df1", "_df2"),
)

# Fields to evaluate, one entry per extracted key.
columns_of_interest = [
    "date_of_registration",
    "registration_no",
    "car_province",
    "vehicle_use",
    "type",
    "body_style",
    "manufacturer",
    "model",
    "year",
    "color",
    "chassis_number",
    "chassis_location",
    "engine_manufacturer",
    "engine_number",
    "engine_location",
    "fuel_type",
    "fuel_tank_number",
    "cylinders",
    "cubic_capacity",
    "horse_power",
    "axles_wheels_no",
    "unladen_weight",
    "load_capacity",
    "gross_weight",
    "seats",
]

# Minor data post-processing: remove the "ค.ศ." / "คศ." prefix from the
# predicted year. regex=False makes "." a literal dot regardless of the
# installed pandas version (older versions treat the pattern as a regex, where
# "." matches any character and could swallow a digit).
merged_df["year_df2"] = (
    merged_df["year_df2"]
    .str.replace("ค.ศ.", "", regex=False)
    .str.replace("คศ.", "", regex=False)
)

# For every field, compute the mean character error rate (CER) and the
# exact-match accuracy (in percent) between ground truth (_df1) and
# prediction (_df2).
eval_list = []
for column in columns_of_interest:
    gt_column = f"{column}_df1"
    pred_column = f"{column}_df2"

    # Collect one CER per row across ALL rows. The score list must not be
    # reset inside the row loop, otherwise only the last row would count.
    # Rows with an empty ground truth are skipped for CER.
    cer_scores = [
        jiwer.cer(gt, pred)
        for gt, pred in zip(
            merged_df[gt_column].astype(str),
            merged_df[pred_column].astype(str),
        )
        if gt
    ]
    # No scorable row -> NaN explicitly (avoids numpy's empty-mean warning).
    cer = np.mean(cer_scores) if cer_scores else np.nan

    # Exact string match over all rows, including those with empty ground truth.
    accuracy = (merged_df[gt_column] == merged_df[pred_column]).mean() * 100

    eval_list.append({
        "column_name": column,
        "cer": cer,
        "accuracy": accuracy,
    })
eval_df = pd.DataFrame(eval_list)

ตรวจสอบข้อมูลในแต่ละ key จาก prediction และ ground truth ได้ด้วยการดึง 2 หลักมาเปรียบเทียบกัน

# Show ground truth (_df1) and prediction (_df2) side by side for one field.
col = "year"  # alternative: col = "date_of_registration"
compare_columns = [f"{col}_{suffix}" for suffix in ("df1", "df2")]
merged_df[compare_columns]
what-in-the-nim commented 2 hours ago

Progress tracking (a reminder for myself 555):