Python3WebSpider / Jiepai

Jiepai Pictures of Toutiao
124 stars 142 forks source link

设置cookie爬取街拍组图内所有图片 #8

Open OliverHate opened 6 years ago

OliverHate commented 6 years ago

需要从浏览器中拷贝 cookie，否则只加载基本网页结构。

import requests
import re
import json
import os
from urllib.parse import urlencode
from requests import codes
from multiprocessing.pool import Pool
# Shared HTTP request headers for every call below.
# NOTE(review): "heardes" is a typo for "headers"; kept as-is because all
# functions in this file reference it by this name.
# SECURITY: the cookie value is a live browser session copied in by hand —
# per the issue text, Toutiao only serves the full gallery markup to a
# cookied session.  Do not commit real cookies; replace before running.
heardes = {
        "user-agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 69.0.3497.100Safari / 537.36",
        "cookie": 'tt_webid=6591674770407851527; UM_distinctid=16555dd135182a-0aaf457dfee96f-2711938-144000-16555dd13525c0; csrftoken=e874c1c4c92de13796f658055b321509; tt_webid=6591674770407851527; WEATHER_CITY=%E5%8C%97%E4%BA%AC; uuid="w:69e07afc4113469c8a67bc7e1191a8fe"; ccid=5b05c00c464a23364e612a873a8f8bf8; CNZZDATA1259612802=1586168555-1534743351-%7C1539254542; __tasessionId=y296rn1i01539258641108',
        "x-requested-with": "XMLHttpRequest",

    }
def get_soucre_code(offset):
    """Fetch one page of the Toutiao "街拍" gallery search API.

    Args:
        offset: pagination offset (callers pass multiples of 20).

    Returns:
        The ``requests.Response`` on HTTP 200, otherwise ``None``
        (non-200 status and connection errors both yield ``None``).
    """
    params = {
        "offset": offset,
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": 20,
        "cur_tab": 3,
        "from": "gallery",
    }
    base_url = "https://www.toutiao.com/search_content/?"
    url = base_url + urlencode(params)
    try:
        # BUG FIX: timeout added — without it a stalled connection hangs
        # the pool worker forever (requests has no default timeout).
        resp = requests.get(url, headers=heardes, timeout=10)
        if resp.status_code == codes.ok:
            return resp
        # Explicit: the original fell off the end and returned None
        # implicitly on non-200; make that path visible.
        return None
    except requests.ConnectionError:
        return None
def get_images(json):
    """Yield ``(title, open_url)`` pairs from one page of search results.

    Args:
        json: decoded search-API response; gallery entries live under
            the ``data`` key.

    Yields:
        Tuples of the gallery title and its relative detail-page URL
        (either element may be ``None`` if missing from an entry).
    """
    entries = json.get('data')
    if not entries:
        return
    for entry in entries:
        yield entry.get('title'), entry.get('open_url')

def get_images_url(open_url, title):
    """Scrape one gallery detail page and save a subset of its images.

    The detail page embeds its image list as a backslash-escaped JSON blob
    inside a ``gallery: JSON.parse(...)`` script fragment; that fragment is
    cut out with a regex, un-escaped, and the ``"url"`` fields extracted.

    Args:
        open_url: relative detail-page path from the search API.
        title: gallery title, used downstream as the folder name.

    Returns:
        None.  Connection errors and pages without a gallery blob are
        silently skipped (best-effort crawler behaviour).
    """
    try:
        detail = requests.get(
            'https://www.toutiao.com{}'.format(open_url), headers=heardes)
        # Dots escaped: the original pattern's bare "." in "JSON.parse"
        # matched any character.
        gallery_reg = re.compile(r"gallery: JSON\.parse(.*?)siblingList:", re.S)
        url_reg = re.compile(r'"url":"(.*?)"')
        match = gallery_reg.search(detail.text)
        if match is None:
            # BUG FIX: the original called .group(0) on None and crashed
            # with AttributeError when the page had no gallery blob (e.g.
            # login wall or markup change) — that exception was not caught
            # by the ConnectionError handler below.
            return None
        blob = match.group(0).replace('\\', '')
        pic_urls = url_reg.findall(blob)
        # NOTE(review): stride of 4 presumably skips duplicate URL entries
        # in the blob — confirm against the live page structure.
        for num in range(0, len(pic_urls), 4):
            save_image(pic_urls[num], title, num)
            print(pic_urls[num])
    except requests.ConnectionError:
        return None

def save_image(pic_url, title, num):
    """Download one image into ``./data/<title>/<num>.jpg``.

    Args:
        pic_url: absolute URL of the image.
        title: gallery title, used verbatim as the directory name.
            NOTE(review): a title containing a path separator, or ``None``,
            would produce a broken path — confirm upstream sanitization.
        num: index within the gallery, used as the file name.

    Returns:
        None.  Connection errors are swallowed (best-effort crawler).
    """
    path = './data/{title}/'.format(title=title)
    # BUG FIX: the original check-then-mkdir pair raced between pool
    # workers — two processes could both see the directory missing and the
    # slower one crashed with FileExistsError.  makedirs(exist_ok=True)
    # creates the whole chain and tolerates concurrent creation.
    os.makedirs(path, exist_ok=True)
    try:
        pic = requests.get(pic_url, headers=heardes)
        with open(os.path.join(path, '{num}.jpg'.format(num=num)), mode='wb') as f:
            f.write(pic.content)
            print('{title}/{num}.jpg 写入成功'.format(title=title, num=num))
    except requests.ConnectionError:
        return None

def main(page):
    """Crawl one search-result page: fetch the listing, then every gallery.

    Args:
        page: offset passed to the search API (callers use multiples of 20).
    """
    resp = get_soucre_code(page)
    if resp is None:
        # BUG FIX: the original called .json() on the return value
        # unconditionally; get_soucre_code returns None on any failure,
        # so a failed request crashed the worker with AttributeError.
        return
    for title, open_url in get_images(resp.json()):
        get_images_url(open_url, title)

if __name__ == '__main__':
    # Fan the search offsets 20, 40, ..., 200 out across a process pool;
    # each worker crawls one result page end-to-end via main().
    offsets = [page * 20 for page in range(1, 11)]
    worker_pool = Pool()
    worker_pool.map(main, offsets)
    worker_pool.close()
    worker_pool.join()