incubrain / astrotribe

A global network of astronomers helping to inspire and educate the next generation.
https://astrotribe.vercel.app

util: dynamic puppeteer scraper #73

Closed. Drew-Macgibbon closed this issue 1 year ago.

Drew-Macgibbon commented 1 year ago

Features:

import { Page } from 'puppeteer'

interface Post {
  title: {
    name: string | null
    link: string | null
  }
  author: {
    name: string | null
    link: string | null
  }
  published: {
    date: string | null
    link: string | null
  }
  category: {
    name: string | null
    link: string | null
  }
  content: string | null
}

type ScrapeFunction = (page: Page) => Promise<Post[]>

const scrapeWebb: ScrapeFunction = async (page: Page) => {
  let posts: Post[] = []

  // paginate until the blog has no further "next" link
  while (true) {
    try {
      const newPosts = await page.$$eval('article', (articles: Element[]) =>
        articles.map((article: Element) => {
          // target the inner anchors so both the text and the href are available
          const titleNode = article.querySelector('.entry-title a')
          const authorNode = article.querySelector('.entry-footer .author a')
          const publishedNode = article.querySelector('.entry-footer .posted-on a')
          const categoryNode = article.querySelector('.entry-footer .cat-links a')
          const contentNode = article.querySelector('.entry-content')

          return {
            title: {
              name: titleNode?.textContent?.trim() || null,
              link: titleNode?.getAttribute('href') || null
            },
            author: {
              name: authorNode?.textContent?.trim() || null,
              link: authorNode?.getAttribute('href') || null
            },
            published: {
              date: publishedNode?.textContent?.trim() || null,
              link: publishedNode?.getAttribute('href') || null
            },
            category: {
              name: categoryNode?.textContent?.trim() || null,
              link: categoryNode?.getAttribute('href') || null
            },
            content: contentNode?.innerHTML || null
          }
        })
      )

      posts = [...posts, ...newPosts]

      const nextPageLink = await page.$$eval('.nav-links .next', (nodes) =>
        nodes.length ? nodes[0].getAttribute('href') : null
      )

      if (!nextPageLink) {
        console.log(`scrapeWebb: last page ${posts.length}`)
        break
      }

      console.log('scrapeWebb: next page')
      await new Promise((resolve) => setTimeout(resolve, 2000))
      await page.goto(nextPageLink)
    } catch (error: any) {
      console.error(`scrapeWebb: error scraping page - ${error.message}`)
      break
    }
  }

  return posts
}

export default scrapeWebb
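
For reference, a minimal sketch of how this scraper could be driven (not part of the original code; the './scrapeWebb' module path and launch options are assumptions, and the entry URL is the Webb blog used later in this thread):

import puppeteer from 'puppeteer'

import scrapeWebb from './scrapeWebb' // assumed module path for the scraper above

const run = async () => {
  const browser = await puppeteer.launch({ headless: true })
  const page = await browser.newPage()

  // entry point for the Webb blog; pagination is handled inside scrapeWebb
  await page.goto('https://blogs.nasa.gov/webb/')

  const posts = await scrapeWebb(page)
  console.log(`scraped ${posts.length} posts`)

  await browser.close()
}

run()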
Drew-Macgibbon commented 1 year ago

Refactor

Made the function reusable by driving it from a per-blog selector config; it's much more maintainable now.

import { Browser, Page } from 'puppeteer'

interface SelectorConfig {
  [key: string]: string
}

interface Blog {
  name: string
  url: string
  selectorConfig: SelectorConfig
}

interface ScrapeFunction {
  (browser: Browser, blog: Blog): Promise<any[]>
}

const scraperGeneric: ScrapeFunction = async (browser: Browser, blog: Blog) => {
  console.log(`genericScraper: scrape ${blog.name}`)
  let posts: any[] = []

  const page: Page = await browser.newPage()
  await page.goto(blog.url)

  while (true) {
    try {
      const newPosts = await page.$$eval(
        'article',
        (articles: HTMLElement[], selectorConfig: SelectorConfig) =>
          articles.map((article: HTMLElement) => {
            const data: { [key: string]: any } = {}

            // walk the configured selectors and pull each field out of the article
            for (const key in selectorConfig) {
              const node = article.querySelector(selectorConfig[key])
              if (!node) {
                if (key === 'featured_image') {
                  data[key] = null // set featured_image to null if not found
                  continue
                }
                throw new Error(`Missing ${key} node in article`)
              }

              if (key === 'content') {
                data[key] = node.textContent?.replace(/\n/g, ' ').trim()
              } else if (key === 'featured_image') {
                data[key] = node.getAttribute('src')
              } else {
                data[key] = {
                  name: node.textContent?.trim(),
                  link: node.getAttribute('href')
                }
              }
            }

            return data
          }),
        blog.selectorConfig
      )

      posts = [...posts, ...newPosts]

      const nextPageLink = await page.$$eval('.nav-links .next', (nodes) =>
        nodes.length ? nodes[0].getAttribute('href') : null
      )

      if (!nextPageLink) {
        console.log(`genericScraper: last page ${posts.length}`)
        break
      }

      console.log('genericScraper: next page')
      await new Promise((resolve) => setTimeout(resolve, 2000))
      await page.goto(nextPageLink)
    } catch (error: any) {
      console.error(`genericScraper: error scraping page - ${error.message}`)
      break
    }
  }

  return posts
}

export default scraperGeneric

We can now just define the selector config for each blog we want to scrape:

interface Blog {
  name: string
  url: string
  selectorConfig: {
    title: string
    author: string
    published: string
    category: string
    content: string
    featured_image: string
  }
}

const scraperBlogs: Blog[] = [
  {
    name: 'jwst-nasa-blog',
    url: 'https://blogs.nasa.gov/webb/',
    selectorConfig: {
      title: '.entry-title a',
      author: '.entry-footer .author a',
      published: '.entry-footer .posted-on a',
      category: '.entry-footer .cat-links a',
      content: '.entry-content',
      featured_image: '.entry-content img'
    }
  }
  // Add more blogs as needed
]

export default scraperBlogs
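
For completeness, a hedged sketch of how scraperGeneric and scraperBlogs might be wired together; the module paths and launch options are assumptions, not part of the issue:

import puppeteer from 'puppeteer'

import scraperGeneric from './scraperGeneric' // assumed module paths
import scraperBlogs from './scraperBlogs'

const scrapeAll = async () => {
  const browser = await puppeteer.launch({ headless: true })

  // each configured blog reuses the same generic scrape function
  for (const blog of scraperBlogs) {
    const posts = await scraperGeneric(browser, blog)
    console.log(`${blog.name}: scraped ${posts.length} posts`)
  }

  await browser.close()
}

scrapeAll()

The design choice here is that site-specific knowledge lives in data (the selector config) rather than in code, so adding a new blog is just another entry in scraperBlogs.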
Drew-Macgibbon commented 1 year ago

Improvements:

Drew-Macgibbon commented 1 year ago

This was refactored into a single function; reference #72.