Open v5tech opened 7 years ago
# -*- coding:utf-8 -*- import requests from bs4 import BeautifulSoup import time import mysql.connector # 插入古诗 def insert_poems(title,author): cnx = mysql.connector.connect(user='root', password='root', host='192.168.99.142', database='sys') cursor = cnx.cursor() insert_poems = ("INSERT INTO `poems` (`title`,`author`) VALUES (%s,%s)") cursor.execute(insert_poems, (title,author)) cursor.close() cnx.commit() cnx.close() def fetchMingju(): page = 1 while(page<=114): url = 'http://so.gushiwen.org/mingju/Default.aspx?p='+str(page) res = requests.get(url,headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}) content = BeautifulSoup(res.content,'lxml') items = content.select('div[class="sons"]') for item in items: title = item.select('a')[0].text author = item.select('a')[1].text.replace('____','') insert_poems(title,author) print title,author page+=1 # 获取古诗 def fetchPoems(): page = 1 while(page<=200): url = 'http://so.gushiwen.org/type.aspx?p='+str(page) res = requests.get(url,headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}) content = BeautifulSoup(res.content,'lxml') items = content.select('div[class="sons"]') for item in items: title = item.select('p')[0].text url = 'http://so.gushiwen.org'+item.select('p > a')[0].attrs['href'] res = requests.get(url, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}) content = BeautifulSoup(res.content, 'lxml') print content.select('div[class="shileft"] h1')[0].text page += 1 time.sleep(2) # 获取作者 def fetchAuthors(): page = 1 while(page<=200): url = 'http://so.gushiwen.org/authors/Default.aspx?p='+str(page) res = requests.get(url,headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}) content = BeautifulSoup(res.content,'lxml') items = content.select('div[class="sonsauthor"]') for item in items: author = item.select('p')[0].text print author page += 1 time.sleep(2) def insert_stackoverflow(vote,answer,view,title): cnx = mysql.connector.connect(user='root', password='root', host='192.168.99.142', database='sys') cursor = cnx.cursor() insert_stackoverflow = ("INSERT INTO `stackoverflow` (`vote`,`answer`,`view`,`title`) VALUES (%s,%s,%s,%s)") cursor.execute(insert_stackoverflow, (vote,answer,view,title)) cursor.close() cnx.commit() cnx.close() def fetchstackoverflow(): page = 1 while(page<=200): url = 'http://stackoverflow.com/questions/tagged/java?page='+str(page)+'&sort=votes&pagesize=10' res = requests.get(url, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}) content = BeautifulSoup(res.content, 'lxml') items = content.select('div[class="question-summary"]') for item in items: vote = item.select('div strong')[0].text answer = item.select('div strong')[1].text view = item.select('div.views')[0]['title'].replace(',','').replace(' views','') title = item.select('a[class="question-hyperlink"]')[0].text print vote,answer,view,title insert_stackoverflow(vote,answer,view,title) print '-------------------->%d' % page page+=1 time.sleep(5) fetchstackoverflow() fetchMingju()