Open AnnVoV opened 5 years ago
const request = require('request-promise-native');
const cheerio = require('cheerio');
const config = require('../config');
const zhihuRoot = config.zhihu.root;
const pageSize = config.page.pageSize;
const ColumnModel = require('../model/column');
const ContentModel = require('../model/content');
const USER_NAME = 'anran-0423';

// db start
const {host, database, port} = config.db;
const mongoose = require('mongoose');
mongoose.Promise = global.Promise;
mongoose.connect(host, database, port);

/**
 * Fetch one window of the columns followed by USER_NAME and upsert each
 * of them into ColumnModel.
 *
 * @param {number} offset - value sent as the `offset` query parameter.
 * @param {number} limit - value sent as the `limit` query parameter.
 * @returns {Promise<Array>} resolves with the results of the
 *   findOneAndUpdate calls, one per entry of the response's `data` array.
 */
const exploreColumns = async (offset, limit) => {
  const query = [`offset=${offset}`, `limit=${limit}`].join('&');
  const options = {
    method: 'GET',
    uri: `https://www.zhihu.com/api/v4/members/${USER_NAME}/following-columns?${query}`,
    json: true,
  };
  const rsData = await request(options);
  // Store the column data. findOneAndUpdate is used instead of update, with
  // `new: true`, so that the stored document is what the query resolves with.
  // See:
  // https://segmentfault.com/a/1190000009706886
  // https://stackoverflow.com/questions/32811510/mongoose-findoneandupdate-doesnt-return-updated-document
  const upserts = rsData.data.map((column) =>
    ColumnModel
      .findOneAndUpdate({id: column.id}, column, {upsert: true, new: true})
      .exec()
  );
  return Promise.all(upserts);
};

/**
 * Fetch the article list of a column and keep at most its first three
 * entries, each tagged with the column's `id` as `columnId`.
 *
 * @param {Object} column - a column object; only `column.id` is read.
 * @returns {Promise<Array<Object>>} up to three article objects.
 *   Rejects if the HTTP request fails (the previous version never settled
 *   in that case, which stalled the whole crawl).
 */
const getArticledData = async (column) => {
  const options = {
    uri: `https://zhuanlan.zhihu.com/api2/columns/${column.id}/articles`,
    json: true,
  };
  const res = await request(options);
  // Only the first 3 entries of the list are kept.
  return res.data
    .slice(0, 3)
    .map((article) => Object.assign({}, article, {columnId: column.id}));
};

/**
 * Read how many columns USER_NAME follows, by parsing the JSON embedded in
 * the `#js-initialData` element of the user's "following columns" page.
 *
 * NOTE(review): despite the name, this returns a column count, not a page
 * size; the name is kept so existing call sites keep working.
 *
 * @returns {Promise<number>} the `followingColumnsCount` value for USER_NAME.
 *   Rejects on request or parse failure so the caller can handle it (the
 *   previous version logged the error and then never settled).
 */
const getPageSize = async () => {
  const html = await request(`${zhihuRoot}/people/${USER_NAME}/following/columns`);
  const $ = cheerio.load(html);
  const jsonData = JSON.parse($('#js-initialData').html());
  const users = jsonData.initialState.entities.users;
  return users[USER_NAME].followingColumnsCount;
};

/**
 * Upsert every article into ContentModel, linking each one to the stored
 * column through the column's `_id`.
 *
 * @param {Array<Object>} articleArr - articles to store; each needs an `id`.
 * @param {Object} column - stored column document; only `column._id` is read.
 * @returns {Promise<Array>} resolves once every upsert has completed.
 */
const saveArticles = (articleArr, column) => {
  const writes = articleArr.map((article) => {
    const doc = Object.assign({}, article, {columnId: column._id});
    // The query promise must be returned; otherwise Promise.all resolves
    // before the writes finish and write errors go unhandled.
    return ContentModel
      .update({id: article.id}, doc, {upsert: true})
      .exec();
  });
  return Promise.all(writes);
};

/**
 * Crawl entry point: work out how many windows of followed columns exist,
 * fetch and store every window in parallel, then fetch and store the
 * articles of each column. Logs a success message when everything has been
 * written, or the error if any step fails.
 *
 * @returns {Promise<void>}
 */
const init = async () => {
  try {
    const allNum = await getPageSize();
    const pageCount = Math.ceil(allNum / pageSize);
    const pageIndexes = Array.from({length: pageCount}, (_, index) => index);
    const pageTasks = pageIndexes.map(async (pageIndex) => {
      // The second argument is a limit (items per request), not an end
      // index, so it is always pageSize.
      const columns = await exploreColumns(pageIndex * pageSize, pageSize);
      const columnTasks = columns.map(async (column) => {
        const articleArr = await getArticledData(column);
        return saveArticles(articleArr, column);
      });
      return Promise.all(columnTasks);
    });
    await Promise.all(pageTasks);
    console.log('抓取数据成功!');
  } catch (err) {
    console.log(err);
  } finally {
    // Close the connection so this one-shot script can exit.
    await mongoose.disconnect();
  }
};

init();
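主要遇到的问题:在 forEach 循环中使用 async/await(Using async/await with a forEach loop)。forEach 不会等待异步回调返回的 Promise,因此循环之后的代码会在异步操作完成之前执行;需要改用 for...of(顺序执行)或 Promise.all(arr.map(async ...))(并行执行)。参考:https://stackoverflow.com/questions/37576685/using-async-await-with-a-foreach-loop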
主要遇到的问题: Using async/await with a forEach loop https://stackoverflow.com/questions/37576685/using-async-await-with-a-foreach-loop