slevin48 / openai

🐱ChatGPT-like Bot🤖 with OpenAI API
https://chat48.streamlit.app/
MIT License
1 star · 10 forks · source link

Turn websites into API #19

Status: Open — slevin48 opened this issue 1 year ago

slevin48 commented 1 year ago

https://community.openai.com/t/turn-any-website-into-an-api-with-gpt-4/145689 https://www.kadoa.com/playground

Example: Financial Data (Yahoo Finance)

image

image

DOM selectors

[
  {
    "name": "date",
    "description": "date",
    "selector": "tbody > .BdT > td:nth-child(1) span",
    "type": "TEXT"
  },
  {
    "name": "open",
    "description": "open",
    "selector": "tbody > .BdT > td:nth-child(2) span",
    "type": "TEXT"
  },
  {
    "name": "high",
    "description": "high",
    "selector": "tbody > .BdT > td:nth-child(3) span",
    "type": "TEXT"
  }
]

Extracted data

[
  {
    "date": "Apr 06, 2023",
    "open": "183.08",
    "high": "186.39"
  },
  {
    "date": "Apr 05, 2023",
    "open": "190.52",
    "high": "190.68"
  },
  {
    "date": "Apr 04, 2023",
    "open": "197.32",
    "high": "198.74"
  },
  {
    "date": "Apr 03, 2023",
    "open": "199.91",
    "high": "202.69"
  },
...
]

generated-scraper.py


import asyncio
from pyppeteer import launch
from typing import List, Dict

# Helper function to extract content based on type
async def get_content_for_type(element, page, selector: str, type_: str) -> str:
    val = ""
    text_ob = element
    try:
        text_ob = await element.querySelector(selector)
    except Exception as e:
        print("INFO: selector not valid - probably the parent object is the object")

    try:
        if type_ == 'TEXT':
            val = (await page.evaluate('(el) => el.textContent', text_ob)).strip()
        elif type_ == 'IMAGE':
            # Probably not complete - could also be in srcset or so....
            val = await page.evaluate('(el) => el.src', text_ob)
        elif type_ == 'LINK':
            # Probably not complete
            val = await page.evaluate('(el) => el.href', text_ob)

        return val
    except Exception as e:
        print("INFO: object not found", e)

# Function to generate a common selector
def generate_common_selector(selectors):
    arr = [s.replace(' > ', '> ').split(' ') for s in selectors]
    arr.sort()
    a1 = arr[0]
    a2 = arr[len(arr) - 1]
    L = len(a1)
    i = 0
    while i < L and a1[i] == a2[i]:
        i += 1
    return ' '.join([s.replace('>', ' >') for s in a1[:i]])

# Function to scrape data from the website
async def scrape_data(page, selectors: List[Dict]) -> List[Dict]:
    common_sub_path = generate_common_selector([s["selector"] for s in selectors])
    sub_selectors = [
        {**s, "selector": s["selector"].replace(common_sub_path, "").strip()} for s in selectors
    ]
    common_sub_path = common_sub_path[:-2] if common_sub_path.endswith('>') else common_sub_path

    elements = await page.querySelectorAll(common_sub_path) if common_sub_path else [page]

    scraped_data = []
    for element in elements:
        data = {}
        for selector in sub_selectors:
            data_point = await get_content_for_type(element, page, selector["selector"], selector["type"])
            if data_point:
                data[selector["name"]] = data_point
        scraped_data.append(data)

    return scraped_data

# DOM selectors for the Yahoo Finance historical-data table: one entry per
# column (date / open / high), each targeting the span inside successive
# <td> cells of rows with class "BdT". Shared prefix "tbody > .BdT" is the
# repeated row container that scrape_data iterates over.
SELECTORS = [{"name":"date","description":"date","selector":"tbody > .BdT > td:nth-child(1) span","type":"TEXT"},{"name":"open","description":"open","selector":"tbody > .BdT > td:nth-child(2) span","type":"TEXT"},{"name":"high","description":"high","selector":"tbody > .BdT > td:nth-child(3) span","type":"TEXT"}];

# Target page: TSLA historical prices on Yahoo Finance.
LINK = "https://finance.yahoo.com/quote/TSLA/history?p=TSLA"

async def main():
    """Launch a visible Chromium, scrape the configured table, print it.

    The browser is launched non-headless with automation hints disabled
    (Yahoo Finance may behave differently for obvious automation).
    """
    browser = await launch(
        headless=False,
        timeout=100000,
        ignoreDefaultArgs=["--enable-automation"],
        args=[],
        defaultViewport=None
    )
    try:
        page = await browser.newPage()
        await page.goto(LINK, waitUntil=["networkidle2"], timeout=15000)
        scraped_data = await scrape_data(page, SELECTORS)
        print(scraped_data)
    finally:
        # bug fix: always release the Chromium process, even when
        # navigation or scraping raises; previously a failure before
        # browser.close() leaked the browser.
        await browser.close()

if __name__ == "__main__":
    # asyncio.run replaces the get_event_loop().run_until_complete pattern,
    # which is deprecated since Python 3.10; the guard also prevents the
    # scrape from running on import.
    asyncio.run(main())
slevin48 commented 1 year ago

https://jamesturk.net/posts/scraping-with-gpt-4/

import json

import openai
import requests  # bug fix: requests (and json) were used but never imported

# NOTE(review): `url` must be defined before this snippet runs.
html = requests.get(url)
completion = openai.ChatCompletion.create(
    # bug fix: ChatCompletion.create takes `model`, not `engine`
    model="gpt-4",
    # this controls how long the JSON output can be, 
    # 2048 tokens is about 8,000 characters
    # which should be more than enough 
    # (note: this impacts the cost of the request)
    max_tokens=2048,
    # temperature controls how random the output is
    # 0 is completely deterministic
    # which is what we want for scraping
    temperature=0,
    # at the time of writing I only had GPT-4 
    # access via the chat interface
    messages=[
        {
            # bug fix: chat messages use "role"/"content" keys,
            # not "user"/"text"; also restored the missing space
            # before the schema in the concatenated prompt
            "role": "system",
            "content": 'Convert the given HTML to JSON with the schema '
            '{"name": "string", "age": "number"}',
        },
        {
            "role": "user",
            "content": html.text,
        },
    ],
)
# extract JSON 
data = json.loads(completion.choices[0]["message"]["content"])
slevin48 commented 1 year ago

https://github.com/slevin48/immo-scraper