Open lsy641111 opened 1 year ago
我这里就以安庆晚报作为例子讲解
爬取这类报纸的关键在于:要在我们原有流程的基础上先获取到每天新闻的网址,所以要在爬取开始时增加一步单独获取网址链接的部分,也就是我这里的 pre_fetch_url
。对不同的网站可能这里的代码会不一样,但是总的逻辑都是先获取网址链接,然后再沿用我们之前的模板(两个模板都可以),这里我就还是用CrawlSpider的模板,代码如下:
# -*- coding: utf-8 -*-
import json
import requests
import scrapy
import re
from newscrapy.items import NewscrapyItem
from scrapy import FormRequest
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urljoin
from urllib import parse
class mySpider(CrawlSpider):
    """Spider for the 安庆晚报 (Anqing Evening News) e-paper.

    Daily issues are only reachable through per-day URLs, so
    ``pre_fetch_url`` first queries a JSON endpoint for all issue paths of a
    month; ``start_requests`` then seeds the crawl with those URLs, and the
    ``rules`` follow edition/page links (``eid=``) and article links
    (``sid=``), the latter handled by :meth:`parse_item`.
    """

    name = "aqwb"
    newspapers = '安庆晚报'

    # First rule follows edition/page links; second extracts article pages.
    # Raw strings so the \d escapes are not invalid-escape warnings.
    rules = (
        Rule(LinkExtractor(allow=(r'/epaper/read.do.+?eid=\d+&idate=.+?',))),
        Rule(LinkExtractor(allow=(r'/epaper/read.do.+?sid=\d+&idate=.+?',)),
             callback="parse_item"),
    )

    def pre_fetch_url(self, date='2023-03-21'):
        """Fetch the issue paths for the month containing *date*.

        The ``m=getIssueByMonth`` endpoint returns one entry per issue of the
        whole month, so a historical crawl can call this once per month
        (e.g. with the last day of each month).

        :param date: ``YYYY-MM-DD`` string sent to the endpoint.  Kept as a
            parameter (default preserves the original hard-coded value) so
            callers can target other months without editing the spider.
        :return: list of dicts from the JSON ``data`` field; each entry is
            expected to carry a ``path`` key — confirm against the API.
        """
        url_month = 'http://aqdzb.aqnews.com.cn/epaper/read.do?m=getIssueByMonth'
        payload = {
            'newspaperId': '2',  # 2 = 安庆晚报; 安庆日报 is presumably id 1 — verify
            'date': date,
        }
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Host': 'aqdzb.aqnews.com.cn',
            'Referer': 'http://aqdzb.aqnews.com.cn/epaper/read.do?m=i&iid=11042&idate=2_2023-03-01',
            'Cookie': 'JSESSIONID=6A67FCA5AD2C1CE5ADEEA6A066B815CA; __jsluid_h=dffec1e3d9a5600ded393128991632f7',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36 Edg/106.0.1370.47'
        }
        r = requests.post(url_month, data=payload, headers=headers)
        return r.json()['data']

    def start_requests(self):
        """Seed the crawl with every issue URL of the pre-fetched month."""
        paths = self.pre_fetch_url()
        urls = [urljoin('http://aqdzb.aqnews.com.cn', item['path'])
                for item in paths]
        for url in urls:
            # dont_filter: the same issue URL may also surface via the
            # link-extractor rules; we still want the seed request to run.
            yield FormRequest(url, dont_filter=True)

    def parse_item(self, response):
        """Extract one article page into a ``NewscrapyItem``.

        :param response: article page matched by the ``sid=`` rule.
        """
        # BUG FIX: xpath() returns a SelectorList; the original passed the
        # selectors straight to str.join(), which raises TypeError.  Extract
        # the text first; a missing title becomes ''.
        title = response.xpath("//p[@class='articleTitle']/text()").get() or ''
        title2 = response.xpath("//p[@class='articleTitle2']/text()").get() or ''
        title = '/'.join([title, title2])
        content = response.xpath("//div[@class='articleContent']").xpath("string(.)").get()
        url = response.url
        # BUG FIX: the original pattern ended in the non-greedy \d+?, which
        # at end-of-pattern matches a single digit and truncates the day
        # ("2023-03-21" -> "2023-03-2").  \d{1,2} captures the full day.
        date = re.findall(r"idate=\d+_(\d{4}-\d{2}-\d{1,2})", url)
        date = date[0] if date else ''
        imgs = response.xpath("//div[@id='article_image']/img//@src").getall()
        imgs = [parse.urljoin(url, imgurl) for imgurl in imgs]
        item = NewscrapyItem()
        item['title'] = title
        item['content'] = content
        item['date'] = date
        item['imgs'] = imgs
        item['url'] = response.url
        item['newspaper'] = self.newspapers
        # item['html'] = response.text  # enable to also store the raw page
        yield item
post
请求,传参是获取的日期,后续历史任务获取时可以改为每个月的最后一天,因为从这个请求的参数m=getIssueByMonth
可知它是每月为单位,所以用历史上每个月的最后一天传入。这里的newspaperId=2
,而安庆日报的Id应该是1,具体可以按照我讲的方法去查看安庆日报的这个访问的参数,这里就不多赘述了。
http://aqdzb.aqnews.com.cn/epaper/read.do?m=i&iid=10742&idate=1_2022-08-19