hjyssg / ShiguReader

硬核宅宅资源管理器. Ultimate Manga Resource Manager
MIT License
393 stars 45 forks source link

crawler #97

Open hjyssg opened 3 years ago

hjyssg commented 3 years ago

const HCCrawler = require('headless-chrome-crawler');
const JSONLineExporter = require('headless-chrome-crawler/exporter/json-line');

// Destination file for the exported crawl results.
// NOTE(review): the extension is .csv although the exporter below is the
// JSON-line one — confirm which format is actually intended.
const FILE_PATH = String.raw`C:\git\examples\result.csv`;

// Exporter handed to the crawler; only the `response.url` field is written.
// (Alternatives that were tried: fields ['options', 'response.url'] and an
// explicit '\n' separator.)
const exporter = new JSONLineExporter({
  fields: ['response.url'],
  file: FILE_PATH,
});

// TODO: split this into two crawlers —
// one for the index page,
// the other for the detail pages.

// Entry point: launches a headless-chrome crawler, queues one doujinshi.org
// circle page, waits until the queue drains, then shuts the browser down.
// Results are logged via onSuccess and written through `exporter`.
(async () => {
  const crawler = await HCCrawler.launch({
    // The queued request below sets `delay`; the HCCrawler docs require
    // maxConcurrency to be 1 whenever a delay is used.
    maxConcurrency: 1,
    // maxDepth: 2, // Maximum depth for following links automatically.
    //              // Defaults to 1; leave the default to disable link following.
    exporter,

    // Evaluated inside the crawled page. `$` is presumably the jQuery instance
    // HCCrawler injects by default (its `jQuery` option) — confirm if that
    // option is ever turned off.
    evaluatePage: () => ({
      title: $('title').text(),
    }),
    onSuccess: (result) => {
      console.log(result);
    },
    onError: (error) => {
      console.error(error);
    },
    // Return false to skip a URL; every URL is currently accepted.
    preRequest: () => true,
  });

  try {
    await crawler.queue({
      url: 'https://www.doujinshi.org/browse/circle/31071/',
      // HCCrawler documents `screenshot` as an options object that is passed
      // to Puppeteer's page.screenshot(); a bare string carries no `path`,
      // so the file name has to be given as `path`.
      screenshot: { path: '31071.png' },
      obeyRobotsTxt: false,
      delay: 1500,
    });

    await crawler.onIdle();
  } finally {
    // Always release the browser, even when queueing or crawling fails.
    await crawler.close();
  }
})().catch((error) => {
  // Surface launch/crawl failures instead of leaving the promise floating.
  console.error(error);
  process.exitCode = 1;
});

```js
hjyssg commented 3 years ago

/**
 * Extracts author-name tokens from the "More ..." links of a fetched page.
 *
 * The anchors matched by `a[title^='More']` are consumed in pairs, the first
 * being treated as the info link and the second as the author link (this
 * pairing is taken from the original loop — verify against the page markup).
 *
 * Example author link texts and their tokens:
 *   "mil (Xration)"                      -> ["mil", "Xration"]
 *   "Morinaga Milk / 森永みるく (Myao)"  -> ["森永みるく", "Myao"]
 *
 * @param {{data: string}} response - Response whose `data` is handed to
 *   `cheerio.load` (presumably the page HTML — confirm with the caller).
 * @returns {Array<{infoLink: object, authorLink: object, author: string[]}>}
 *   One entry per complete link pair; `author` holds the non-empty tokens.
 */
function parseLink(response) {
  const $ = cheerio.load(response.data);
  const links = $("a[title^='More']");

  // Separator characters, built once instead of on every iteration. The dash
  // is escaped and `–` / `、` are listed individually: the original class
  // contained `–-、`, an accidental range covering U+2013..U+3001 that also
  // split on symbols such as `☆`.
  const sep = /[ .,\/#!$%^&*;:{}=\-_`~()\[\]–、?@。『』「」’・|+]/;

  const results = [];
  // `ii + 1 < length` skips a trailing unpaired link instead of throwing.
  for (let ii = 0; ii + 1 < links.length; ii += 2) {
    const infoLink = links[ii];
    const authorLink = links[ii + 1];

    // Cheerio nodes expose their text through $(node).text(), not the DOM
    // `textContent` property.
    let str = $(authorLink).text();

    // "Romanized / Native (Circle)" -> keep the part after the first slash.
    const tokens = str.split('/');
    if (tokens.length > 1) {
      str = tokens[1];
    }

    const author = str.split(sep).filter(Boolean);
    results.push({ infoLink, authorLink, author });
  }

  return results;
}