Python3WebSpider / Jiepai

Jiepai Pictures of Toutiao
124 stars 142 forks source link

2019/11/4,可爬取示例 #23

Open liyunlongaaa opened 5 years ago

liyunlongaaa commented 5 years ago
import os
from hashlib import md5
from multiprocessing.pool import Pool
import requests
from urllib.parse import urlencode

# First and last result-page indices to crawl (both inclusive). The entry
# point multiplies each index by 20 to build the `offset` request parameter.
# NOTE(review): "STRAT" looks like a typo for "START"; the name is left as-is
# because the entry-point code refers to it.
GROUP_STRAT = 1
GROUP_END = 10

# Base endpoint of the search API; the urlencoded query string is appended to it.
URL = 'https://www.toutiao.com/api/search/content/?'

def get_page(offset):
    """Fetch one page of search results and return it as parsed JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an error status, or the body is not valid JSON.
    """
    # NOTE(review): the cookie is a hard-coded session string; presumably it
    # has to be refreshed from a browser session once it expires - confirm.
    headers = {
        'cookie': 'tt_webid=6755317032361018894; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __tasessionId=iix0i88fv1572844826803; tt_webid=6755317032361018894; s_v_web_id=9e3d1341a0ce3a6625dafa55bd50a8c5; csrftoken=4e5d24d1aa648714a20fa478adb67e3c; sso_uid_tt=9989002b18278fa8a91a2d9bfceff0db; toutiao_sso_user=22695687fb520aba3bba92d81182e09d; sid_guard=77ee6776e543023f76e7fb53e187aa0b%7C1572844883%7C5184000%7CFri%2C+03-Jan-2020+05%3A21%3A23+GMT; uid_tt=6bb53dbf3ec71913bc04a6c4a27ac973; sid_tt=77ee6776e543023f76e7fb53e187aa0b; sessionid=77ee6776e543023f76e7fb53e187aa0b',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
    }
    params = {
        'aid': 24,
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍美图',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        'timestamp': '1572845169181'
    }
    real_url = URL + urlencode(params)
    print(real_url)
    try:
        # A timeout keeps a stalled connection from blocking a pool worker
        # indefinitely.
        response = requests.get(real_url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException covers connection, timeout and HTTP status errors;
        # ValueError covers a body that cannot be decoded as JSON.
        print('请求失败', exc)
        return None

def get_images(json):
    """Yield one ``{'title': ..., 'image': ...}`` dict per image in a page.

    Args:
        json: Parsed search response (a dict whose ``'data'`` entry is a list
            of result items), or ``None`` when the page could not be fetched.

    Yields:
        A dict holding the item's ``'title'`` and the ``'url'`` of one entry
        of its ``'image_list'``.

    Items without a usable ``'image_list'`` are skipped individually, so one
    such item no longer discards the rest of the page.
    """
    items = json.get('data') if json else None
    if not isinstance(items, list) or not items:
        print('data为空')
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        images = item.get('image_list')
        if not isinstance(images, list):
            # Result has no image list; nothing to download for it.
            continue
        title = item.get('title')
        for image in images:
            yield {
                'title': title,
                'image': image.get('url'),
            }
def save_image(item):
    """Download one image into ``今日头条街拍/<title>/<md5 of content>.jpg``.

    Args:
        item: Mapping with ``'title'`` (used for the sub-directory name) and
            ``'image'`` (the image URL).

    Download and filesystem errors are reported on stdout instead of being
    raised. A file whose content hash already exists on disk is not rewritten.
    """
    # The title comes from the remote response, so replace path separators and
    # characters that are invalid in Windows file names, and strip leading or
    # trailing dots/spaces so it cannot resolve to '.' or '..'.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    raw_title = item.get('title') or 'untitled'
    folder = str(raw_title).translate(unsafe_chars).strip(' .') or 'untitled'
    directory = os.path.join("今日头条街拍", folder)
    image_url = item.get('image')
    try:
        # exist_ok avoids the check-then-create race between pool workers that
        # handle items with the same title at the same time.
        os.makedirs(directory, exist_ok=True)
        print(image_url)
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        # Naming the file after the content hash de-duplicates identical images.
        file_name = md5(response.content).hexdigest() + '.jpg'
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            print('Already Download', file_path)
            return
        with open(file_path, 'wb') as f:
            print('正在下载图片...')
            f.write(response.content)
            print('下载成功!')
    except (requests.RequestException, OSError) as exc:
        print('图片链接异常,保存出错', exc)

def main(offset):
    """Crawl one result page: fetch it, then download every image it lists.

    Args:
        offset: Pagination offset forwarded to ``get_page``.
    """
    print(offset)
    page = get_page(offset)
    if page is None:
        # get_page returns None after a failed request; there is nothing to
        # parse, and passing None on would crash this worker.
        return
    for item in get_images(page):
        print(item)
        save_image(item)

if __name__ == '__main__':
    # One task per result page; the API offset advances in steps of 20.
    offsets = [group * 20 for group in range(GROUP_STRAT, GROUP_END + 1)]
    # The context manager releases the worker processes even if a task raises;
    # map() blocks until every page is done, so no work is cut short on exit.
    with Pool() as pool:
        pool.map(main, offsets)