gildas-lormeau / single-file-cli

CLI tool for saving a faithful copy of a complete web page in a single HTML file (based on SingleFile)
GNU Affero General Public License v3.0
539 stars 57 forks source link

SingleFile CLI API unable to bulk download #100

Closed takingurstuff closed 3 days ago

takingurstuff commented 1 week ago

this script utilizes singlefile CLI's api to bulk download pages with the added assistance from puppeteer in case it needs to scroll or interact with the webpage in any way. The URLs file had 8 lines in it, each line being a url for downloading, but the script only visited about half of it:

import { initialize } from './single-file-cli-api.js'
import puppeteer from 'puppeteer-core'
import readline from 'readline'
import fs from 'fs/promises' // Using the Promise-based version of fs
;(async () => {
  let browser
  try {
    browser = await initialize()
    console.log(browser)
    var conn = browser.apiPort
    console.log(`Received browser debug URL: ${conn}`)
    console.log('launching browser')
  } catch (error) {
    console.error('Failed to launch browser:', error)
    process.exit(1)
  }
  await wait(10000)
  const pupurl = conn
  console.log('Puppeteer attempt URL: ' + pupurl)

  let pc
  try {
    pc = await puppeteer.connect({ browserURL: pupurl })
  } catch (error) {
    console.error('Failed to connect to Puppeteer:', error)
    process.exit(1)
  }

  let page
  try {
    page = await pc.newPage()
    await page.goto('https://bbs.quantclass.cn')
  } catch (error) {
    console.error('Failed to open page:', error)
    process.exit(1)
  }

  try {
    console.log('Waiting for User Login')
    await waitForEnter()
    console.log('User Logged In. Continuing...')
  } catch (error) {
    await browser.finish()
    console.error(error.message)
    process.exit(1)
  }

  let urls
  try {
    const data = await fs.readFile('./urls.txt', 'utf8')
    urls = data.split(/\n/)
  } catch (error) {
    console.error('Failed to read URLs file:', error)
    process.exit(1)
  }

  try {
    await browser.capture(urls)
  } catch (error) {
    console.error('Failed to capture page for URL:', urls, error)
  }
  console.log('download finished, cleaning up')
  await browser.finish()
})()

async function waitForEnter () {
  return new Promise((resolve, reject) => {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    })

    rl.question(
      'Press Enter to confirm login (or any other key to exit)...',
      input => {
        rl.close()
        if (input.trim() === '') {
          resolve()
        } else {
          reject(new Error('User entered a value instead of pressing Enter.'))
        }
      }
    )
  })
}

function wait (ms) {
  return new Promise(resolve => {
    setTimeout(() => resolve(), ms)
  })
}

Ignore the wierd naming on some of the variables any suggestions on the code being buggy or optimizable are also welcome (considering I am only a beginner in javascript)