berstend / puppeteer-extra

💯 Teach puppeteer new tricks through plugins.
https://extra.community
MIT License
6.23k stars 731 forks source link

[Bug] Puppeteer gets detected on cloudflare #888

Closed marcpre closed 2 months ago

marcpre commented 2 months ago

Describe the bug

When opening a Cloudflare-protected page, I get this:

image

Code Snippet

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
const path = require("path");
const fs = require("fs");
const winston = require("winston");
const DailyRotateFile = require("winston-daily-rotate-file");
const mysql = require("mysql");
const util = require("util");

// Apply the stealth plugin to avoid being detected as a bot.
// The plugin instance is created first so that its enabled evasions can be
// adjusted before it is registered with puppeteer-extra.
const puppeteerStealth = StealthPlugin();
// Drop the 'user-agent-override' evasion: this script sets its own user agent
// on the page with page.setUserAgent() further down.
// NOTE(review): a hard-coded user agent may disagree with the version of the
// browser that is actually launched — confirm this is intended.
puppeteerStealth.enabledEvasions.delete('user-agent-override');
puppeteer.use(puppeteerStealth);

// Load environment variables from the two candidate .env locations:
// two levels above this script first, then one level above.
["../../.env", "../.env"].forEach((relativeEnvFile) => {
    require("dotenv").config({ path: path.resolve(__dirname, relativeEnvFile) });
});

// Database settings are read from the DB_* environment variables.
const {
    DB_HOST,
    DB_USERNAME,
    DB_PASSWORD,
    DB_DATABASE,
    DB_PORT,
} = process.env;

const connection = mysql.createConnection({
    host: DB_HOST,
    user: DB_USERNAME,
    password: DB_PASSWORD,
    database: DB_DATABASE,
    port: DB_PORT,
});

// Start connecting right away; the connection is ended when the main routine finishes.
connection.connect();

// Value of APP_ENV, reported in the startup log.
const appEnvironment = process.env.APP_ENV;

// Output directories next to this script: screenshots are written to "_img",
// rotated log files to "_logs".
const imgDirectory = path.join(__dirname, "_img");
const logDirectory = path.join(__dirname, "_logs");

// With `recursive: true`, mkdirSync is a no-op when the directory already
// exists, so no existsSync() pre-check is needed (and the check-then-create
// race goes away). A non-directory sitting at one of these paths now fails
// here instead of later, when the first file is written.
for (const directory of [imgDirectory, logDirectory]) {
    fs.mkdirSync(directory, { recursive: true });
}
// Renders one log entry as "<timestamp> [<label>] <level>: <message>".
function formatLogLine({ timestamp, label, level, message }) {
    return `${timestamp} [${label}] ${level}: ${message}`;
}

// Shared logger for the whole script. Entries at level "info" and above are
// labelled "quietlight", timestamped, and sent both to the console and to a
// date-stamped file inside logDirectory (rotation settings: zippedArchive,
// maxSize "20m", maxFiles "7d").
const logger = winston.createLogger({
    level: "info",
    format: winston.format.combine(
        winston.format.label({ label: "quietlight" }),
        winston.format.timestamp(),
        winston.format.printf(formatLogLine)
    ),
    transports: [
        new winston.transports.Console(),
        new DailyRotateFile({
            filename: path.join(logDirectory, "quietlight_%DATE%.log"),
            datePattern: "YYYY-MM-DD",
            zippedArchive: true,
            maxSize: "20m",
            maxFiles: "7d",
        }),
    ],
});

// Saves a full-page PNG screenshot of `page` into imgDirectory.
//   page - object exposing an async screenshot({ path, fullPage }) method
//   tag  - label embedded in the file name:
//          screenshot_<tag>_<digits of the current ISO timestamp>.png
// Errors raised while capturing are logged through `logger`, not rethrown.
async function captureScreenshot(page, tag) {
    // Keep only the digits of the ISO timestamp so the name is file-system friendly.
    const stamp = new Date().toISOString().replace(/[^0-9]/g, "");
    const screenshotPath = path.join(imgDirectory, `screenshot_${tag}_${stamp}.png`);
    const screenshotOptions = { path: screenshotPath, fullPage: true };

    try {
        await page.screenshot(screenshotOptions);
        logger.info(
            `Screenshot of the page has been captured and saved to ${screenshotPath}.`
        );
    } catch (captureError) {
        logger.error(
            "An error occurred while capturing a screenshot of the page:",
            captureError
        );
    }
}

// Logs the time elapsed since `startTime`, in seconds, through `logger`.
//   startTime - Date marking when the script started
function logDuration(startTime) {
    const elapsedSeconds = (Date.now() - startTime) / 1000; // milliseconds -> seconds
    logger.info(`Script execution time: ${elapsedSeconds} seconds`);
}

// Start of the run; handed to logDuration() once the script finishes.
const startTime = new Date();

// Startup banner followed by runtime details, logged in this order.
[
    `############# Starting Quietlight Scrapper #############`,
    `Running on Node.js ${process.version}`,
    `App Environment: ${appEnvironment}`,
].forEach((line) => {
    logger.info(line);
});

// Main routine: launches the browser, opens https://quietlight.com/ in the
// first tab, takes a screenshot, and then always cleans up and exits.
(async () => {
    // Declared outside `try` so the `catch` and `finally` blocks can reach them.
    // Previously `page` was scoped to the `try` block, so the error handler
    // itself threw a ReferenceError and the real failure was never logged.
    let browser;
    let page;

    try {
        browser = await puppeteer.launch({
            // Set to false to watch the browser window while debugging.
            headless: true,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--disable-gpu',
                '--lang=en-US,en', // Set language explicitly
            ],
        });

        // Reuse the first tab that is already open after launch. The original
        // author's note on this line was "bypasses Cloudflare" (unverified).
        page = (await browser.pages())[0];
        await page.setViewport({ width: 1366, height: 768 });
        await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36");
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'en-US,en;q=0.9'
        });
        // 0 disables the navigation timeout, so a slow page never aborts goto().
        page.setDefaultNavigationTimeout(0);

        await page.goto("https://quietlight.com/", { waitUntil: 'networkidle2' });

        // Awaited so the file is written before the browser can be closed.
        await captureScreenshot(page, "quietlightPage");

        await page.waitForSelector("img");
        // Fixed 10 s pause after the first <img> appears.
        // NOTE(review): presumably lets the page settle — confirm it is still needed.
        await page.waitForTimeout(10000);

    } catch (error) {
        // Log first so the failure is recorded even if the screenshot cannot be taken.
        logger.error("An error occurred during the script execution:", error);
        // `page` is still undefined when the launch or the tab lookup failed.
        if (page) {
            await captureScreenshot(page, "errorInMainLoop");
        }
        // Picked up by the argument-less process.exit() below, so a failed run
        // no longer reports success to the caller.
        process.exitCode = 1;
    } finally {
        if (browser) {
            await browser.close();
        }
        connection.end();
        logDuration(startTime);
        logger.info(`############# End quietlight Scrapper #############`);
        process.exit();
    }
})();

Versions

This is my package.json:

{
  "name": "quietlight",
  "version": "1.0.0",
  "description": "",
  "main": "quietlight.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node quietlight.js"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "devDependencies": {
    "axios": "^1.4.0",
    "dotenv": "^16.4.5",
    "minimist": "^1.2.8",
    "mysql": "^2.18.1",
    "puppeteer": "^19.11.1",
    "puppeteer-extra": "^3.3.6",
    "puppeteer-extra-plugin-stealth": "^2.11.2",
    "winston": "^3.8.2",
    "winston-daily-rotate-file": "^4.7.1"
  },
  "dependencies": {
    "google-auth-library": "^9.6.3",
    "google-spreadsheet": "^4.1.1"
  }
}
marcpre commented 2 months ago

The issue was with how I was calling the Puppeteer stealth plugin in my script.