Open OliverHate opened 6 years ago
需要从浏览器中拷贝 cookie，否则只加载基本网页结构。
import requests import re import json import os from urllib.parse import urlencode from requests import codes from multiprocessing.pool import Pool heardes = { "user-agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 69.0.3497.100Safari / 537.36", "cookie": 'tt_webid=6591674770407851527; UM_distinctid=16555dd135182a-0aaf457dfee96f-2711938-144000-16555dd13525c0; csrftoken=e874c1c4c92de13796f658055b321509; tt_webid=6591674770407851527; WEATHER_CITY=%E5%8C%97%E4%BA%AC; uuid="w:69e07afc4113469c8a67bc7e1191a8fe"; ccid=5b05c00c464a23364e612a873a8f8bf8; CNZZDATA1259612802=1586168555-1534743351-%7C1539254542; __tasessionId=y296rn1i01539258641108', "x-requested-with": "XMLHttpRequest", } def get_soucre_code(offset): params = { "offset": offset, "format": "json", "keyword": "街拍", "autoload": "true", "count": 20, "cur_tab": 3, "from": "gallery", } base_url = "https://www.toutiao.com/search_content/?" url = base_url + urlencode(params) try: resp = requests.get(url,headers=heardes) if codes.ok == resp.status_code: return resp except requests.ConnectionError: return None def get_images(json): if json.get('data'): data = json.get('data') for item in data: title = item.get('title') open_url = item.get('open_url') yield title,open_url def get_images_url(open_url,title): try: image_source_code = requests.get('https://www.toutiao.com{}'.format(open_url),headers=heardes) pic_json_reg =re.compile("gallery: JSON.parse(.*?)siblingList:",re.S) pic_url_reg = re.compile('"url":"(.*?)"') urls = pic_json_reg.search(image_source_code.text) urls = urls.group(0) urls = urls.replace('\\','') pic_urls = pic_url_reg.findall(urls) if pic_urls: for num in range(0,len(pic_urls),4): save_image(pic_urls[num],title,num) print(pic_urls[num]) except requests.ConnectionError: return None def save_image(pic_url,title,num): path = './data/{title}/'.format(title=title) if not os.path.exists('data'): os.mkdir('data') if not os.path.exists(path): os.mkdir(path) try: 
pic = requests.get(pic_url,headers=heardes) with open('data/{title}/{num}.jpg'.format(title=title,num=num),mode='wb') as f: f.write(pic.content) print('{title}/{num}.jpg 写入成功'.format(title=title,num=num)) except requests.ConnectionError: return None def main(page): json = get_soucre_code(page) json = json.json() for title,open_url in get_images(json): get_images_url(open_url,title) if __name__ == '__main__': pool = Pool() groups = [x * 20 for x in range(1,11)] pool.map(main,groups) pool.close() pool.join()
需要从浏览器中拷贝 cookie，否则只加载基本网页结构。