Drew-Macgibbon opened 1 year ago
import { Browser, Page } from 'puppeteer'
import { Blog } from './scraperBlogs'

// Maps the name of a data field to the CSS selector string used to find the corresponding element on the page.
interface SelectorConfig {
  [key: string]: string
}

// Takes a Puppeteer Browser instance and a Blog object, and returns a promise that resolves to an array of scraped data.
interface ScrapeFunction {
  (browser: Browser, blog: Blog): Promise<any[]>
}

const scraperGeneric: ScrapeFunction = async (browser: Browser, blog: Blog) => {
  console.log(`genericScraper: scrape ${blog.name}`)
  let posts: any[] = []
  const page: Page = await browser.newPage()
  await page.goto(blog.url)

  // Loop through all pages.
  while (true) {
    try {
      // Extra arguments passed to page.$$eval are forwarded to the page function, so selectorConfig is available inside it.
      const newPosts = await page.$$eval(
        blog.selectorBase,
        (articles: Element[], selectorConfig: SelectorConfig) =>
          articles.map((article: Element) => {
            const data: { [key: string]: any } = {}
            // For each key in the selectorConfig, find the corresponding element.
            for (const key in selectorConfig) {
              const element = article.querySelector(selectorConfig[key])
              if (!element) {
                if (key === 'image') {
                  // If there is no image, store nulls instead of throwing.
                  data[key] = {
                    src: null,
                    alt: null,
                    caption: null
                  }
                  continue
                }
                throw new Error(`Missing ${key} element in article`)
              }
              if (key === 'content') {
                // Clone the element and strip figcaptions so they don't leak into the text content.
                const clonedElement = element.cloneNode(true) as Element
                const figcaptions = clonedElement.querySelectorAll('figcaption')
                figcaptions.forEach((figcaption) => figcaption.remove())
                data[key] = clonedElement.textContent?.replace(/\n/g, ' ').trim()
              } else if (key === 'image') {
                const captionElement = element.querySelector('figcaption')
                const imgElement = element.querySelector('img')
                data[key] = {
                  src: imgElement ? imgElement.getAttribute('src') : null,
                  alt: imgElement ? imgElement.getAttribute('alt') : null,
                  caption: captionElement ? captionElement.textContent?.trim() : null
                }
              } else {
                data[key] = {
                  name: element.textContent?.trim(),
                  link: element.getAttribute('href')
                }
              }
            }
            return data
          }),
        blog.selectorConfig
      )
      posts = [...posts, ...newPosts]

      // Find the link to the next page.
      const nextPageLink = await page.$$eval(blog.selectorPagination, (elements) =>
        elements.length ? elements[0].getAttribute('href') : null
      )
      if (!nextPageLink) {
        console.log(`genericScraper: last page ${posts.length}`)
        break
      }
      console.log('genericScraper: next page')
      await new Promise((resolve) => setTimeout(resolve, 2000))
      await page.goto(nextPageLink)
    } catch (error: any) {
      console.error(`genericScraper: error scraping page - ${error.message}`)
      break
    }
  }
  return posts
}

export default scraperGeneric
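For reference, each entry pushed into posts ends up with roughly this shape under the mapping above. This is a hypothetical ScrapedPost type I'm sketching for illustration, not something in the code; the keys mirror the selectorConfig entries.

// Hypothetical shape of one scraped post, derived from the mapping in scraperGeneric.
interface LinkedField {
  name: string | undefined   // element.textContent, trimmed
  link: string | null        // href attribute
}

interface ScrapedPost {
  title: LinkedField
  author: LinkedField
  published: LinkedField
  category: LinkedField
  content: string | undefined          // figcaptions stripped, newlines collapsed
  image: {
    src: string | null
    alt: string | null
    caption: string | null | undefined
  }
}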
This function takes a list of blogs as input; the Blog type and the blog configs live in scraperBlogs:
export interface Blog {
  name: string
  url: string
  selectorBase: string
  selectorPagination: string
  selectorConfig: {
    title: string
    author: string
    published: string
    category: string
    content: string
    image: string
  }
}

const scraperBlogs: Blog[] = [
  {
    name: 'jwst-nasa-blog',
    url: 'https://blogs.nasa.gov/webb/',
    selectorBase: 'article',
    selectorPagination: '.nav-links .next',
    selectorConfig: {
      title: '.entry-title a',
      author: '.entry-footer .author a',
      published: '.entry-footer .posted-on a',
      category: '.entry-footer .cat-links a',
      content: '.entry-content',
      image: '.entry-content figure'
    }
  }
  // Add more blogs as needed
]
export default scraperBlogs
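To tie the two files together, a driver could launch one browser, cycle through scraperBlogs, and hand each one to scraperGeneric. This is a minimal sketch, assuming the scraper lives in scraperGeneric.ts and that a scrapeAllBlogs entry point is wanted; neither name is in the code above.

import puppeteer, { Browser } from 'puppeteer'
import scraperGeneric from './scraperGeneric'
import scraperBlogs from './scraperBlogs'

// Cycle through every configured blog with one shared browser instance.
const scrapeAllBlogs = async () => {
  const browser: Browser = await puppeteer.launch({ headless: true })
  try {
    for (const blog of scraperBlogs) {
      const posts = await scraperGeneric(browser, blog)
      console.log(`${blog.name}: scraped ${posts.length} posts`)
      // TODO: persist the posts somewhere (DB, file, API) instead of just logging.
    }
  } finally {
    await browser.close()
  }
}

export default scrapeAllBlogs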
What we want: blogs as input, cycles through them, and scrapes them. I'll try to get something working using puppeteer for a single blog. I can use Vercel cron jobs for this (https://vercel.com/docs/cron-jobs) or node-cron.
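If node-cron is the route taken, scheduling could look like the sketch below. The daily 06:00 schedule and the './scraperIndex' path for the scrapeAllBlogs driver above are placeholders, not decisions made in this issue.

import cron from 'node-cron'
import scrapeAllBlogs from './scraperIndex'

// Run the full scrape once a day at 06:00; adjust the cron expression as needed.
cron.schedule('0 6 * * *', () => {
  scrapeAllBlogs().catch((error: Error) => {
    console.error(`scheduled scrape failed: ${error.message}`)
  })
})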