jcyrss opened this issue 4 years ago
Fetch data through the interface provided by Sohu. Take the stock Shanghai Airport (600009) as an example and get its daily data for July 16 to 20.
The parameters mean:
code: the stock code, prefixed with cn_
start: the start date
end: the end date
stat: summary statistics; when set to 0, the stat value is not returned
order: sort order (D for descending, A for ascending)
period: data period (d for daily, m for monthly, w for weekly)
It then returns data like the following, in JSON format:
[{
"status":0,
"hq":[
["2018-07-20","61.22","61.83","0.61","1.00%","61.22","62.69","57637","35856.55","0.53%"],
["2018-07-19","63.00","61.22","-1.54","-2.45%","60.27","63.19","61372","37656.60","0.56%"],
["2018-07-18","62.28","62.76","0.48","0.77%","61.75","63.80","48778","30629.53","0.45%"],
["2018-07-17","62.70","62.28","-0.25","-0.40%","61.16","62.70","48519","29986.43","0.44%"],
["2018-07-16","62.00","62.53","0.80","1.30%","62.00","64.30","76005","47872.05","0.70%"]
],
"code":"cn_600009",
"stat":["累计:","2018-07-16至2018-07-20","0.10","0.16%",60.27,64.3,292311,182001.16,"2.68%"]
}]
Take this record from the returned data as an example:
"2018-07-20","61.22","61.83","0.61","1.00%","61.22","62.69","57637","35856.55","0.53%"
The fields are, in order: date, open, close, change, change percent, low, high, volume, turnover, and turnover rate.
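A minimal sketch of calling this interface with requests (the endpoint http://q.stock.sohu.com/hisHq is the same one used by the stockinfo function later in this thread; the parameter values follow the list above):

import requests

# query the Sohu quote API described above
payload = {
    'code': 'cn_600009',   # stock code, prefixed with cn_
    'start': '20180716',   # start date
    'end': '20180720',     # end date
    'stat': 1,             # 0 would suppress the stat summary line
    'order': 'D',          # D = descending, A = ascending
    'period': 'd',         # d = daily, w = weekly, m = monthly
}
res = requests.get('http://q.stock.sohu.com/hisHq', params=payload)
for day in res.json()[0]['hq']:
    print(day[0], day[2])   # date and closing price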
import csv
import requests, re

class QCWY:
    def __init__(self, keyword, city, maxpagenum):
        self.keyword = keyword
        self.city = city
        self.maxpagenum = maxpagenum

    def run(self):
        areaCode = self.getAreaCode()
        totalPage = None
        with open(f'前程无忧招聘_关键词_{self.keyword}_城市_{self.city}.csv',
                  'w', newline='', encoding='gbk') as f:
            f_csv = csv.DictWriter(f,
                                   ['职位名称',
                                    '详细链接',
                                    '公司名称',
                                    '工作地点',
                                    '薪资',
                                    '发布时间',
                                    '职位信息'])
            f_csv.writeheader()
            for pageNo in range(1, self.maxpagenum + 1):
                # the page number goes into the ",2,{pageNo}.html" part of the URL
                url = f'https://search.51job.com/list/{areaCode},000000,0000,00,9,99,{self.keyword},2,{pageNo}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
                print(url)
                res = requests.get(url)
                resBody = res.content.decode('gbk')
                # print(resBody)
                # determine the total number of result pages (only once)
                if totalPage is None:
                    pattern = r'id="hidTotalPage".*?value="(.*?)"'
                    tp = re.findall(pattern, resBody)[0]
                    print(f'总共 {tp} 页')
                    totalPage = int(tp)
                pattern = r'<div class="el">.*?class="t1.*?">.*?<a.*?>(?P<job>.*?)</a>.*?class="t2"><a.*?>(?P<company>.*?)</a>.*?class="t3">(?P<addr>.*?)</span>.*?class="t4">(?P<salary>.*?)</span>.*?class="t5">(?P<date>.*?)</span>'
                p = re.compile(pattern, re.DOTALL)
                for match in p.finditer(resBody):
                    row = {
                        "职位名称": match.group('job').strip(),
                        "公司名称": match.group('company'),
                        "工作地点": match.group('addr'),
                        "薪资": match.group('salary'),
                        "发布时间": match.group('date'),
                    }
                    f_csv.writerow(row)
                # stop once the last result page has been scraped
                if pageNo == totalPage:
                    break

    def getAreaCode(self):
        '''
        Packet capture shows that the area-code table is requested from
        https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js
        '''
        res = requests.get('https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js')
        part1 = res.text.split('area=')[1].split(';')[0]
        code2area = eval(part1)
        # print(code2area)
        # build the reverse lookup dict: area name -> area code
        area2code = {v: k for k, v in code2area.items()}
        # print(area2code)
        if self.city not in area2code:
            print(f'查无此地: {self.city}')
            exit(2)
        return area2code[self.city]

QCWY(keyword='python', city='上海', maxpagenum=2).run()
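Note that the CSV header above declares 详细链接 and 职位信息, but the row dict never fills them, so DictWriter simply leaves those columns empty. A sketch of also capturing the detail link, assuming the job title's <a> tag carries an href attribute (not confirmed here, so treat that as an assumption):

# extend the regex with a named group for the job link's href;
# assumes the <a> inside class="t1" has an href attribute
pattern = (r'<div class="el">.*?class="t1.*?">.*?'
           r'<a.*?href="(?P<link>.*?)".*?>(?P<job>.*?)</a>.*?'
           r'class="t2"><a.*?>(?P<company>.*?)</a>.*?'
           r'class="t3">(?P<addr>.*?)</span>.*?'
           r'class="t4">(?P<salary>.*?)</span>.*?'
           r'class="t5">(?P<date>.*?)</span>')
# then add the captured group to the row dict:
#     "详细链接": match.group('link'),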
Click this link http://cdn1.python3.vip/files/py/phone.zip
to download the zip package. After unzipping, the structure looks like this:
phone
├─apple
└─samsung
    ├─note
    └─s
The directories correspond to the prices of Apple and Samsung phones.
Each directory contains a Python file with the price of the corresponding phone.
Keeping the directory structure unchanged, turn this directory tree into a Python package named phone.
Then write your own Python program that calls the askPrice function in each of the four module files inside that package and displays the price of every kind of phone.
First put the phone package and your calling program in the same directory, run the calling program, and display the prices.
Then put the phone package and your calling program in different directories, and make the phone package findable by setting sys.path or the PYTHONPATH environment variable, so the calls still succeed; a sketch follows below.
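A minimal sketch of one possible answer: drop an empty __init__.py into phone and every subdirectory to make the tree a package, then import each module and call its askPrice. The module names below are placeholders, since the real .py file names come from the zip:

import sys
# when the phone package sits in a different directory, make it findable
# first; the path here is just an example (PYTHONPATH works the same way)
sys.path.append(r'd:\course\packages')

# hypothetical module names -- substitute the actual file names from the zip
from phone.apple import iphone
from phone.samsung.note import note10

iphone.askPrice()
note10.askPrice()
# call askPrice in the remaining two modules the same way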
import requests, csv

def stockinfo(code, startdate, enddate):
    payload = {'code': code, 'start': startdate, 'end': enddate}
    res = requests.get("http://q.stock.sohu.com/hisHq", params=payload)
    retObj = res.json()
    historyData = retObj[0]['hq']
    dateList = []
    closePriceList = []
    for dayInfo in historyData:
        # collect the date and the closing price (fields 0 and 2 of each record)
        dateList.append(dayInfo[0])
        closePriceList.append(float(dayInfo[2]))
    with open('stock.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        for idx, date in enumerate(dateList):
            writer.writerow([date, closePriceList[idx]])

stockinfo('cn_600009', '20180720', '20181020')
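Since the interface also accepts the stat, order, and period parameters described at the top of this thread, stockinfo could pass them through as well; a small sketch under that assumption:

def stockinfo2(code, startdate, enddate, period='d', order='D'):
    # same endpoint as stockinfo above, with the optional period/order
    # parameters from the API description added
    payload = {'code': code, 'start': startdate, 'end': enddate,
               'stat': 0,         # 0 suppresses the summary line
               'order': order,    # D = descending, A = ascending
               'period': period}  # d = daily, w = weekly, m = monthly
    res = requests.get("http://q.stock.sohu.com/hisHq", params=payload)
    return res.json()[0]['hq']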
import requests, json, time
from pprint import pprint
import datetime
import xlwt

# store every date of the year in a list first; the request parameter needs them later
d1 = datetime.date(2019, 1, 1)
d2 = datetime.date(2019, 12, 31)
days = [d1 + datetime.timedelta(days=x) for x in range((d2 - d1).days + 1)]
# for day in days:
#     print(day.strftime('%Y%m%d'))

# crawl the data and save it to a text file
def getAllInfo2Text():
    # loop over each day and fetch that day's records
    with open('info.txt', 'w', encoding='utf8') as f:
        for day in days:
            dayStr = day.strftime('%Y-%m-%d')
            print(f"抓取数据 {dayStr} ...", end='')
            # first request a submitToken
            res = requests.post('https://app2.szpilot.com.cn/szg/token/submitToken',
                                data='{}')
            ret = res.json()
            # print(ret)
            submitToken = ret['submitToken']
            # replay the request captured with the packet sniffer
            # to fetch that day's schedule
            webParam = json.dumps({"dutyId": "-1", "jobAreaCode": "", "showPresetPilot": "0", "pageIndex": 1, "date": f"{dayStr} 00:00:00", "pageSize": 900})
            print(webParam)
            # https_proxy = "https://10.10.1.11:1080"
            res = requests.post('https://app2.szpilot.com.cn/szg/pilotPlan/sendPilotPlan',
                                data={
                                    'WEB_PARAM': webParam,
                                    'submitToken': submitToken
                                },
                                headers={
                                    'Cookie': 'JSESSIONID=FB326C67F0874655F0EB19A5F6AF095B-n2.22jvm8081; Path=/szg; Secure; HttpOnly',
                                    'Origin': 'https://app2.szpilot.com.cn',
                                    'Referer': 'https://app2.szpilot.com.cn/szg_admin/',
                                },
                                # proxies = {'http':'http://127.0.0.1:8888'}
                                )
            ret = res.json()
            print(ret)
            if not ret['pageDatas']['totalCount'] > 0:
                print('没有记录')
                continue
            print('ok')
            # save to the text file first, one line per day
            info = ret['pageDatas']['list']
            f.write(json.dumps(info) + '\n')
            # requests that come too fast get flagged as a crawler and refused, so wait a bit
            time.sleep(1)

# then parse the text file and transfer the data into Excel
def analyzeFile2Excel():
    # mapping from data field name to column heading
    fieldMap = {
        'dynamicTime': '计划时间',
        'shipNameCn': '中文船名',
        'shipNameEn': '英文船名',
        'shipFlag': '国籍(地区)',
        'shipLength': '船长',
        'draft': '吃水',
        'dynamicName': '动态',
        'startBerth': '起点泊位',
        'endBerth': '终点泊位',
        'master': '主引',
        'assistant': '副引',
        'assistant2': '其它引水',
        'orgShort': '代理',
        'dTelephone': '代理电话',
        '+++1': '交通船',
        'channelName': '航道',
        'headThruster:tailThruster': '侧推',
        'remarks': '备注',
    }
    # create an Excel workbook object
    book = xlwt.Workbook()
    # add a sheet named '引航'
    sh = book.add_sheet('引航')
    # write the header row
    for column, heading in enumerate(fieldMap.values()):
        sh.write(0, column, heading)
    with open('info.txt', 'r', encoding='utf8') as f:
        row = 0
        while True:
            # read one line
            oneline = f.readline()
            if not oneline:
                break
            # one line corresponds to one day's data
            oneday = json.loads(oneline)
            # then walk through every record of that day
            for line in oneday:
                row += 1
                # pprint(line)
                for column, field in enumerate(fieldMap.keys()):
                    # fields missing from the record are left blank for now
                    if field not in line:
                        value = ''
                    else:
                        value = line[field]
                    # the plan time needs special handling: it is an epoch in milliseconds
                    if field == 'dynamicTime':
                        value = time.strftime('%Y%m%d %H:%M', time.localtime(int(line['dynamicTime'] / 1000)))
                        print(value)
                    # the thrusters need special handling: combine bow and stern
                    if field == 'headThruster:tailThruster':
                        if 'headThruster' not in line:
                            hvalue = '无'
                        else:
                            hvalue = line['headThruster']
                        if 'tailThruster' not in line:
                            tvalue = '无'
                        else:
                            tvalue = line['tailThruster']
                        value = f"首:{hvalue} 尾:{tvalue}"
                    sh.write(row, column, value)
    # save the workbook
    book.save('引航表.xls')

getAllInfo2Text()
analyzeFile2Excel()
print('\n\n === 完成 ==== \n\n')
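One caveat on the Excel step: xlwt writes legacy .xls files, which cap at 65536 rows per sheet, and with pageSize=900 a full year of records could in principle exceed that. A sketch of the equivalent calls with openpyxl (xlsx format, which has no such cap at this scale):

from openpyxl import Workbook

book = Workbook()
sh = book.active
sh.title = '引航'
sh.append(list(fieldMap.values()))   # header row
# inside the loop, instead of sh.write(row, column, value):
#     sh.cell(row=row + 1, column=column + 1, value=value)   # 1-indexed
book.save('引航表.xlsx')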
Reference answer