duty-machine / duty-machine-action

抓取网络文章并保存到 GitHub Issue 的 GitHub Action
68 stars 54 forks source link

抓取掘金文章失败 #21

Open fakeyanss opened 2 years ago

fakeyanss commented 2 years ago

加了 juejin.js 之后,在本地调试一直失败,不清楚 document.querySelector('h1.article-title') 返回 null 的原因。

npm run test-website juejin

> duty-machine-action@0.0.1 test-website
> node test.js test-website "juejin"

null
/Users/fakeyanss/project/duty-machine-action/websites/juejin.js:21
    let title = document.querySelector('h1.article-title').textContent
                                                          ^

TypeError: Cannot read properties of null (reading 'textContent')
    at Object.process (/Users/fakeyanss/project/duty-machine-action/websites/juejin.js:21:59)
    at processTicksAndRejections (node:internal/process/task_queues:96:5)
    at async fetchArticle (/Users/fakeyanss/project/duty-machine-action/src/fetchArticle.js:22:19)
    at async /Users/fakeyanss/project/duty-machine-action/test.js:25:21

Node.js v17.4.0

以下是juejin.js

let { URL } = require('url')
let fetch = require('node-fetch')
let { JSDOM } = require('jsdom')

module.exports = {
  test(url) {
    let parsed = new URL(url)
    return parsed.hostname == 'juejin.cn'
  },

  async process(url) {
    let res = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:74.0) Gecko/20100101 Firefox/74.0'
      }
    })
    let html = await res.text()
    let document = new JSDOM(html).window.document

    console.log(document.querySelector('h1.article-title'))
    let title = document.querySelector('h1.article-title').textContent
    let author = document.querySelector('.name').textContent
    let content = document.querySelector('.markdown-body')

    return {
      title,
      author,
      dom: content
    }

  },

  samples: [
    'https://juejin.cn/post/6844903975678902279'
  ]
}