AnnVoV / blog

24 stars 2 forks source link

async, await 练习- 存储关注专栏里面的前3篇文章 #22

Open AnnVoV opened 5 years ago

AnnVoV commented 5 years ago
const request = require('request-promise-native');
const cheerio = require('cheerio');
const config = require('../config');
const zhihuRoot = config.zhihu.root;
const pageSize = config.page.pageSize;
const ColumnModel = require('../model/column');
const ContentModel = require('../model/content');

// Zhihu member id used to build the "following columns" requests below.
const USER_NAME = 'anran-0423';

// Database setup: the connection settings come from config.db.
const {host, database, port} = config.db;
const mongoose = require('mongoose');
// Have mongoose use the runtime's native Promise implementation.
mongoose.Promise = global.Promise;
// NOTE(review): this relies on the connect(host, database, port) call form —
// confirm the installed mongoose version still accepts it.
mongoose.connect(host, database, port);

/**
 * Fetch one slice of the columns followed by USER_NAME and upsert every
 * returned column into ColumnModel.
 *
 * @param {number} offset - Value sent as the `offset` query parameter.
 * @param {number} limit - Value sent as the `limit` query parameter.
 * @returns {Promise<Array>} Resolves with the results of the
 *   findOneAndUpdate calls, in the order the columns appear in the response.
 */
const exploreColumns = async (offset, limit) => {
  const query = `offset=${offset}&limit=${limit}`;
  const {data: followedColumns} = await request({
    method: 'GET',
    uri: `https://www.zhihu.com/api/v4/members/${USER_NAME}/following-columns?${query}`,
    json: true,
  });

  // Store the column. `upsert` inserts it when it is not stored yet; `new`
  // is passed so the call resolves with the updated document, see
  // "Mongoose: findOneAndUpdate doesn't return updated document":
  // https://stackoverflow.com/questions/32811510/mongoose-findoneandupdate-doesnt-return-updated-document
  // Background on findOneAndUpdate vs. update:
  // https://segmentfault.com/a/1190000009706886
  const upsertColumn = async (column) => {
    const pendingUpdate = ColumnModel.findOneAndUpdate(
      {id: column.id},
      column,
      {upsert: true, new: true},
    );
    return pendingUpdate.exec();
  };

  return Promise.all(followedColumns.map((column) => upsertColumn(column)));
};

/**
 * Fetch the article list of a column and keep its first three entries.
 *
 * The previous version wrapped the request in `new Promise` without ever
 * calling `reject`, so a failed request left the returned promise pending
 * forever. Request and parsing errors now reject the returned promise.
 *
 * @param {{id: (string|number)}} column - Column whose articles are fetched;
 *   only `column.id` is read.
 * @returns {Promise<Array<Object>>} Up to three article objects from the
 *   response, each carrying `columnId` set to `column.id`.
 */
const getArticledData = async (column) => {
  const res = await request({
    uri: `https://zhuanlan.zhihu.com/api2/columns/${column.id}/articles`,
    json: true,
  });

  // Only the first three articles of each column are kept.
  return res.data
    .slice(0, 3)
    .map((article) => ({...article, columnId: column.id}));
};

/**
 * Read how many columns USER_NAME follows from the profile page.
 *
 * The page embeds its initial state as JSON inside the `#js-initialData`
 * element; the count is read from
 * `initialState.entities.users[USER_NAME].followingColumnsCount`.
 *
 * The previous version only logged failures and never settled its promise,
 * so callers awaiting it hung forever. Failures now reject.
 *
 * @returns {Promise<number>} The `followingColumnsCount` value of the user.
 * @throws {Error} When the request fails, the embedded JSON cannot be
 *   parsed, or the user entry is missing from the page data.
 */
const getPageSize = async () => {
  const html = await request(`${zhihuRoot}/people/${USER_NAME}/following/columns`);
  const $ = cheerio.load(html);
  const initialData = JSON.parse($('#js-initialData').html());
  const user = initialData.initialState.entities.users[USER_NAME];
  if (!user) {
    throw new Error(`No user data for "${USER_NAME}" found in the page's initial state`);
  }
  return user.followingColumnsCount;
};

/**
 * Upsert every article of a column into ContentModel.
 *
 * The previous version started each write but did not return its promise,
 * so the result resolved before the writes finished and write errors were
 * never reported to the caller. Each write is now part of the returned
 * promise.
 *
 * @param {Array<Object>} articleArr - Articles to store; each one gets its
 *   `columnId` overwritten with `column._id` before being written.
 * @param {{_id: *}} column - Stored column the articles belong to; only
 *   `column._id` is read.
 * @returns {Promise<Array>} Resolves with the result of every update once
 *   all of them have finished; rejects as soon as one of them fails.
 */
const saveArticles = (articleArr, column) => {
  // The callback stays async so that a synchronous throw still surfaces as
  // a rejection of the returned promise.
  const writes = articleArr.map(async (article) => {
    article.columnId = column._id;
    return ContentModel
      .update({id: article.id}, article, {upsert: true})
      .exec();
  });
  return Promise.all(writes);
};

/**
 * Crawl every followed column and store the first articles of each one.
 *
 * Steps: read the number of followed columns, split it into pages of
 * `pageSize`, fetch and store each page of columns, then fetch and store
 * the articles of every column. Pages and columns are processed in
 * parallel.
 *
 * Fixes over the previous version:
 * - the `limit` argument is now `pageSize` (it used to be
 *   `offset + pageSize`, so later pages overlapped earlier ones);
 * - saving the articles is awaited, so the success message is only printed
 *   after everything has been written;
 * - every failure, including one from getPageSize, is caught and logged.
 *
 * @returns {Promise<void>} Resolves once the crawl has finished or failed;
 *   failures are logged, not rethrown.
 */
const init = async () => {
  try {
    const allNum = await getPageSize();
    const pageCount = Math.ceil(allNum / pageSize);
    const pageIndexes = Array.from({length: pageCount}, (_, index) => index);

    await Promise.all(pageIndexes.map(async (pageIndex) => {
      // offset is the index of the first column on this page; limit is the
      // page length.
      const columns = await exploreColumns(pageIndex * pageSize, pageSize);
      await Promise.all(columns.map(async (column) => {
        const articleArr = await getArticledData(column);
        await saveArticles(articleArr, column);
      }));
    }));

    console.log('抓取数据成功!');
  } catch (err) {
    console.log(err);
  }
};

// Start the crawl as soon as this module is loaded.
init();

主要遇到的问题: Using async/await with a forEach loop https://stackoverflow.com/questions/37576685/using-async-await-with-a-foreach-loop