DaishanHuang / titanic

practice for titanic

crawl #1

Closed DaishanHuang closed 7 years ago

DaishanHuang commented 7 years ago

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
__author__ = 'Lining'

import requests, json, re, time, datetime, socket, pyodbc  # requests/time/socket/pyodbc end up unused below
from urllib import request
import pandas as pd
from pandas import DataFrame, Series
from bs4 import BeautifulSoup

def lagou_spider_keyword(keyword):
    # Percent-encode the UTF-8 keyword by hand, then use it to build the lagou.com search URL
    keywordbyte = keyword.encode('utf-8')
    keywordindex = str(keywordbyte).replace(r'\x', '%').replace("'", "")
    keywordindex = re.sub('^b', '', keywordindex)
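    # (A minimal alternative sketch: for a pure-CJK keyword like the default one,
    # the standard library produces the same percent-encoding without the
    # str()/replace round-trip:
    #     from urllib.parse import quote
    #     keywordindex = quote(keyword)
    # quote() emits uppercase hex digits while the hand-rolled version keeps
    # Python's lowercase \xNN escapes; URLs accept both.)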

    # Work out how many result pages the search returns
    i = 0
    first = 'true'  # renamed from `type`, which shadows the builtin
    url = 'http://www.lagou.com/jobs/positionAjax.json?px=default&first=' + first + '&kd=' + keywordindex + '&pn=' + str(i + 1)
    with request.urlopen(url) as f:
        data = f.read()
        urlcount = int(json.loads(str(data, encoding='utf-8', errors='ignore'))["content"]["totalPageCount"])
        print('This search returned %d pages in total' % urlcount)
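    # (The code above assumes the positionAjax.json response is shaped roughly
    # like {"content": {"totalPageCount": N, "result": [...]}} -- inferred from
    # the keys this script reads, not verified against the current site.)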

    # Start the actual crawl
    for i in list(range(0, urlcount)):

        # Build the request for this page (`first` is true only on the first page)
        if i == 0:
            first = 'true'
        else:
            first = 'false'
        url = 'http://www.lagou.com/jobs/positionAjax.json?px=default&first=' + first + '&kd=' + keywordindex + '&pn=' + str(i + 1)
        with request.urlopen(url) as f:
            data = f.read()

        # Read the JSON payload and start parsing
        try:
            jsondata = json.loads(str(data, encoding='utf-8', errors='ignore'))["content"]['result']

            for t in list(range(len(jsondata))):
                # Join the list of company labels into a single string
                jsondata[t]['companyLabelList2'] = '-'.join(jsondata[t]['companyLabelList'])
                jsondata[t].pop('companyLabelList')

                # Turn each record into a Series, then concatenate them row by row
                if t == 0:
                    rdata = DataFrame(Series(data=jsondata[t])).T
                else:
                    rdata = pd.concat([rdata, DataFrame(Series(data=jsondata[t])).T])
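            # (Design note: a list of dicts can be turned into a frame in one
            # call, DataFrame(jsondata); the row-by-row concat above is kept
            # as the author wrote it.)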
            # Renumber rdata's index from 1 and add the derived columns
            rdata.index = range(1, len(rdata) + 1)
            rdata['keyword'] = keyword
            rdata['salarymin'] = 0
            rdata['salarymax'] = 0
            rdata['url'] = ''
            rdata['jd'] = ''  # job description
            rdata['handle_perc'] = ''  # resume handling rate: share of resumes processed within seven days
            rdata['handle_day'] = ''  # average number of days to process a resume
            # .ix is gone from modern pandas; .loc works here because the index labels are 1..n
            for klen in list(range(len(rdata['salary']))):
                rdata.loc[klen + 1, 'salarymin'] = re.search(r'^(\d*?)k', rdata['salary'].iloc[klen]).group(1)
                # If no salary ceiling is given, e.g. "8k以上" ("8k and up"), leave the column empty
                if re.search(r'-(\d*?)k$', rdata['salary'].iloc[klen]) is not None:
                    rdata.loc[klen + 1, 'salarymax'] = re.search(r'-(\d*?)k$', rdata['salary'].iloc[klen]).group(1)
                else:
                    rdata.loc[klen + 1, 'salarymax'] = ''
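                # Worked example (illustrative): a salary of '10k-20k' yields
                # salarymin '10' and salarymax '20'; '8k以上' matches only the
                # first pattern, so salarymax stays ''.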
                # Add a url column so each position's JD page can be fetched next
                rdata.loc[klen + 1, 'url'] = 'http://www.lagou.com/jobs/%s.html' % rdata.loc[klen + 1, 'positionId']

                # Fetch the position page itself and pull the JD text in
                with request.urlopen(rdata.loc[klen + 1, 'url']) as f:
                    data_url = f.read()
                    soup_url = BeautifulSoup(data_url, 'html5lib')
                    strings_url = soup_url.find('dd', class_='job_bt').strings
                    # Round-tripping through GBK drops characters GBK cannot represent
                    rdata.loc[klen + 1, 'jd'] = ''.join(strings_url).encode('gbk', 'ignore').decode('gbk', 'ignore').replace(' ', '')
                    temp = soup_url.find_all('span', class_='data')
                    if re.search(r'>(\w*%)<', str(temp[0])) is None:
                        rdata.loc[klen + 1, 'handle_perc'] = ''
                    else:
                        rdata.loc[klen + 1, 'handle_perc'] = re.search(r'>(\w*%)<', str(temp[0])).group(1)
                    # strip the trailing '天' ("days")
                    rdata.loc[klen + 1, 'handle_day'] = re.search(r'>(\w*)<', str(temp[1])).group(1).replace('天', '')

        except Exception as e:
            # print(Exception) only printed the class itself; print the actual error
            print(e)
            continue
        # totaldata accumulates every page; rdata holds only the current one
        if i == 0:
            totaldata = rdata
        else:
            totaldata = pd.concat([totaldata, rdata])

        totaldata.index = range(1, len(totaldata) + 1)
        print('Crawling search page %d at %s, %d pages to go' % (i + 1, datetime.datetime.now(), urlcount - i - 1))

    # Write the results out (to an Excel file, despite the pyodbc import and the original "database" comment)
    totaldata.to_excel('lagou.xls', sheet_name='sheet1')

if __name__ == '__main__':
    keyword = input('Enter a search term (press Enter for the next step): ')
    keyword = '数据挖掘'  # any search term works -- this hard-coded value ("data mining") overrides the prompt above
    lagou_spider_keyword(keyword)
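For quick testing, the first request can be tried on its own. A minimal standalone sketch, assuming the positionAjax.json endpoint and its content/totalPageCount field still behave as the script above expects (lagou.com has likely changed this API since 2017):

    # Standalone page-count probe -- same endpoint and JSON keys as the script above.
    import json
    from urllib import request
    from urllib.parse import quote

    def total_pages(keyword):
        url = ('http://www.lagou.com/jobs/positionAjax.json'
               '?px=default&first=true&kd=' + quote(keyword) + '&pn=1')
        with request.urlopen(url) as f:
            payload = json.loads(f.read().decode('utf-8', errors='ignore'))
        return int(payload['content']['totalPageCount'])

    if __name__ == '__main__':
        print(total_pages('数据挖掘'))  # prints the page count if the API is unchanged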
DaishanHuang commented 7 years ago

As I remember it wouldn't run, so I just left it there... March 30, 2017