alixaxel / chrome-aws-lambda

Chromium Binary for AWS Lambda and Google Cloud Functions
MIT License

[BUG] - Scrolling doesn't work in AWS Lambda #301

Open joaofaria97 opened 7 months ago

joaofaria97 commented 7 months ago

Environment

Expected Behavior

I currently have a page that needs to be scrolled down to fully load the elements I'm trying to scrape. When I run my scraping code locally, it scrolls fine and successfully loads all the elements.

Snapshot after local run where all elements are successfully loaded: 1701686880746-elementCount_424

Current Behavior

However, when I run this in Lambda, the scrolling does not work and the elements never fully load (the 50 elements present initially, versus the 400+ that load in when I run it locally). I've tried using different selectors as scroll targets, but none seem to work.

Snapshot after lambda run after scrolling is called: 1701688595177-elementCount_50
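One thing worth checking first is whether the scroll call moves anything at all in Lambda. A minimal probe that could be dropped into the handler after page.goto, before any scraping (div#main-view is a guess at the real scroll container, taken from the commented-out selector in bwin.js below; behavior: 'auto' is used so the scroll completes synchronously and the before/after comparison is meaningful):

// Log scroll offsets before and after a non-animated scrollIntoView.
const probe = await page.evaluate(() => {
  const container = document.querySelector('div#main-view');
  const before = container ? container.scrollTop : window.scrollY;
  const footer = document.querySelector('div.grid-footer');
  if (footer) footer.scrollIntoView({ behavior: 'auto', block: 'end' });
  const after = container ? container.scrollTop : window.scrollY;
  return { before, after };
});
console.log('scroll probe:', probe); // identical values => the scroll is a no-op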

Steps to Reproduce

URL: https://sports.bwin.pt/pt/sports/futebol-4/apostar

// Lambda handler (entry file)
const AWS = require('aws-sdk')
const s3 = new AWS.S3({apiVersion: '2006-03-01'});
const chromium = require('chrome-aws-lambda');

const pageURL = process.env.TARGET_URL
const agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'

const Bwin = require('./scrapers/bwin.js')
const db = require('./db.js')

exports.handler = async (event, context) => {

  let result = null;
  let browser = null;

  try {
    browser = await chromium.puppeteer.launch({
      args: chromium.args,
      defaultViewport: chromium.defaultViewport,
      executablePath: await chromium.executablePath,
      headless: chromium.headless,
      ignoreHTTPSErrors: true,
    });

    await db.connectToDb()

    let page = await browser.newPage();

    await page.setUserAgent(agent)

    console.log('Navigating to page: ', pageURL)

    await page.goto(pageURL, { waitUntil: 'networkidle2'})

    let bwin = new Bwin()
    let events = await bwin.scrapeEvents(page)
    console.log('length: ', events.length)

    const buffer = await page.screenshot()
    // upload the image using the current timestamp as filename
    const s3result = await s3
      .upload({
        Bucket: 'mybucket',
        Key: `${Date.now()}-${events.length}.png`,
        Body: buffer,
        ContentType: 'image/png',
        ACL: 'public-read'
      })
      .promise()

    console.log('S3 image URL:', s3result.Location)
    console.log('URL: ', page.url())
    await page.close();
    // the browser itself is closed in the finally block below

  } catch (error) {
    console.log(error)
  } finally {
    if (browser !== null) {
      await browser.close();
    }
  }

  return result
}
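As an aside, the effective viewport inside Lambda may not match a local run, since the launch above passes chromium.defaultViewport. A small probe (hypothetical addition, placed after page.goto) that can be run both locally and in Lambda to compare what the page actually sees:

// Log what the page itself reports; diff the output between environments.
const env = await page.evaluate(() => ({
  width: window.innerWidth,
  height: window.innerHeight,
  devicePixelRatio: window.devicePixelRatio,
  userAgent: navigator.userAgent,
}));
console.log('page environment:', env);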
// scrapers/scraper.js
const db = require('../db.js')
const AWS = require('aws-sdk')
const s3 = new AWS.S3({apiVersion: '2006-03-01'});

class Scraper {
    async scrapeEvents(page) {
        await this.loadPage(page)

        page.on('console', msg => console.log('PAGE LOG:', msg.text()));

        await this.loadAllElements(page, this.eventSelector)
        let events = await page.$$eval(this.eventSelector, this.getEventInfo)
        events = events.map(event => this.parseEventInfo(event))
        return events
    }

    async loadPage(page) {
        await this.closePopUp(page);
        await this.loadElements(page, this.eventSelector, 0)
    }

    async loadElements(page, elementSelector, elementCount) {
        console.log('count: ', elementCount)

        const buffer = await page.screenshot()
        // upload the image using the current timestamp as filename
        const s3result = await s3
        .upload({
            Bucket: 'mybucket',
            Key: `${Date.now()}-elementCount_${elementCount}.png`,
            Body: buffer,
            ContentType: 'image/png',
            ACL: 'public-read'
        })
        .promise()

        console.log('S3 image URL:', s3result.Location)

        // wait until the element count changes; the 30s TimeoutError is the
        // signal (caught in loadAllElements) that no new elements arrived
        await page.waitForFunction((elementSelector, elementCount) => {
            return document.querySelectorAll(elementSelector).length != elementCount;
        }, { timeout: 30000 }, elementSelector, elementCount);
    }

    async loadAllElements(page, elementSelector) {
        try {
            // keep scrolling until loadElements times out, i.e. the element
            // count stops changing
            while (true) {
                let elementCount = await page.evaluate(this.scrollToBottom, this.scrollableSelector, elementSelector)
                await this.loadElements(page, elementSelector, elementCount)
            }
        } catch(error) {
            console.error(error)
        }
    }

    async closePopUp(page) {
        await this.loadElements(page, this.popupSelector, 0)
        await page.evaluate((sel) => document.querySelector(sel).click(), this.popupSelector)
        console.log('popup closed')
    }

    scrollToBottom(scrollableSelector, elementSelector) {
        // runs in the page context; returns the element count from *before*
        // the scroll, so the caller can wait for it to change
        let elementCount = document.querySelectorAll(elementSelector).length;
        document.querySelector(scrollableSelector).scrollIntoView({ behavior: "smooth", block: "end" })
        console.log('SCROLLED TO BOTTOM')
        return elementCount;
    }
}

module.exports = Scraper;
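scrollToBottom above relies on scrollIntoView with behavior: "smooth", which returns before the animated scroll finishes and may behave differently under headless Chromium. A sketch of an alternative that drives the scroll container directly, with no animation (it assumes div#main-view, the commented-out selector in bwin.js below, is the element that actually scrolls):

// Drop-in alternative to scrollToBottom; runs in the page context via
// page.evaluate, exactly like the original, and returns the same count.
function scrollContainerToBottom(scrollableSelector, elementSelector) {
    const elementCount = document.querySelectorAll(elementSelector).length;
    const container = document.querySelector(scrollableSelector);
    if (container) {
        container.scrollTop = container.scrollHeight; // instant jump, no animation
    } else {
        window.scrollTo(0, document.body.scrollHeight); // fall back to the window
    }
    return elementCount;
}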
// scrapers/bwin.js
const Scraper = require('./scraper.js')

class Bwin extends Scraper {
    eventSelector = 'ms-event'
    popupSelector = 'button#onetrust-accept-btn-handler'
    // scrollableSelector = 'div#main-view'
    scrollableSelector = 'div.grid-footer'

    async getEventInfo(events) {
        return events.map(event => {
            return {
                home: event.querySelector('div.participant-wrapper:nth-child(1)').textContent.trim(),
                away: event.querySelector('div.participant-wrapper:nth-child(2)').textContent.trim(),
                date: event.querySelector('ms-event-timer').textContent.trim(),
                competition: event.closest('ms-event-group').querySelector('ms-league-header').textContent.trim()
            }
        })
    }

    parseEventInfo(event) {
        let home = event.home
        let away = event.away

        let [country, competition] = this.parseCompetition(event.competition)
        let date = this.parseDate(event.date)
        return {
            home,
            away,
            date,
            country,
            competition
        }
    }

    parseCompetition(competition) {
        // expected format: "Country | Competition"
        let [country, name] = competition.split('|').map(str => str.trim())
        return [country, name]
    }

    parseDate(dateStr) {
        try {
            let date = new Date()
            if (dateStr.includes('Hoje') || dateStr.includes('Amanhã')) { // 'Hoje' = today, 'Amanhã' = tomorrow
              let [hour, minute] = dateStr.split('/')[1].trim().split(':').map(t => Number(t))
              date.setHours(hour, minute, 0, 0)

              if (dateStr.includes('Amanhã')) date.setDate(date.getDate() + 1)
            } else {
              let [datePart, timeStr] = dateStr.split(' ')
              let [day, month, year] = datePart.split('/').map(t => Number(t))
              let [hour, minute] = timeStr.trim().split(':').map(t => Number(t))

              date = new Date(year, month - 1, day, hour, minute, 0, 0) // JS Date months are 0-based
            }
            return date
          } catch (error) {
            console.log(error)
          }
    }
}

module.exports = Bwin;

Has anyone experienced this?

joaofaria97 commented 7 months ago

I also tried making the viewports equal between local and Lambda; it still doesn't scroll down.
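(For reference, one way to pin the Lambda viewport to a fixed desktop size is via the launch options; 1920×1080 is an arbitrary example, not a value from the original report:)

browser = await chromium.puppeteer.launch({
  args: chromium.args,
  defaultViewport: { width: 1920, height: 1080 }, // fixed size instead of chromium.defaultViewport
  executablePath: await chromium.executablePath,
  headless: chromium.headless,
  ignoreHTTPSErrors: true,
});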