v5tech / notes

notes
https://ameizi.gitee.io/notes
MIT License
1.52k stars 378 forks source link

使用Python抓取古诗词 #154

Open v5tech opened 7 years ago

v5tech commented 7 years ago
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import mysql.connector

# Insert one poem row.
def insert_poems(title, author):
    """Insert a single (title, author) record into the `poems` table.

    Opens a fresh MySQL connection per call. The original version leaked
    both the cursor and the connection when execute() raised, and issued
    commit() only after closing the cursor; here the commit happens while
    the cursor is still open and both handles are released in finally.
    The INSERT stays parameterized, so values are escaped by the driver.
    """
    cnx = mysql.connector.connect(user='root', password='root',
                                  host='192.168.99.142',
                                  database='sys')
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute(
                "INSERT INTO `poems` (`title`,`author`) VALUES (%s,%s)",
                (title, author))
            cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()

def fetchMingju():
    page = 1
    while(page<=114):
        url = 'http://so.gushiwen.org/mingju/Default.aspx?p='+str(page)
        res = requests.get(url,headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'})
        content = BeautifulSoup(res.content,'lxml')
        items = content.select('div[class="sons"]')
        for item in items:
            title = item.select('a')[0].text
            author = item.select('a')[1].text.replace('____','')
            insert_poems(title,author)
            print title,author
        page+=1

# 获取古诗
def fetchPoems():
    page = 1
    while(page<=200):
        url = 'http://so.gushiwen.org/type.aspx?p='+str(page)
        res = requests.get(url,headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'})
        content = BeautifulSoup(res.content,'lxml')
        items = content.select('div[class="sons"]')
        for item in items:
            title = item.select('p')[0].text
            url = 'http://so.gushiwen.org'+item.select('p > a')[0].attrs['href']
            res = requests.get(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'})
            content = BeautifulSoup(res.content, 'lxml')
            print content.select('div[class="shileft"] h1')[0].text
        page += 1
        time.sleep(2)

# 获取作者
def fetchAuthors():
    page = 1
    while(page<=200):
        url = 'http://so.gushiwen.org/authors/Default.aspx?p='+str(page)
        res = requests.get(url,headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'})
        content = BeautifulSoup(res.content,'lxml')
        items = content.select('div[class="sonsauthor"]')
        for item in items:
            author = item.select('p')[0].text
            print author
        page += 1
        time.sleep(2)

# Insert one Stack Overflow question row.
def insert_stackoverflow(vote, answer, view, title):
    """Insert a single (vote, answer, view, title) record into the
    `stackoverflow` table.

    Same fix as insert_poems: the original leaked the cursor and the
    connection when execute() raised, and committed after the cursor was
    already closed. Handles are now released in finally and the commit
    happens while the cursor is open. The INSERT stays parameterized.
    """
    cnx = mysql.connector.connect(user='root', password='root',
                                  host='192.168.99.142',
                                  database='sys')
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute(
                "INSERT INTO `stackoverflow` (`vote`,`answer`,`view`,`title`) VALUES (%s,%s,%s,%s)",
                (vote, answer, view, title))
            cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()

def fetchstackoverflow():
    page = 1
    while(page<=200):
        url = 'http://stackoverflow.com/questions/tagged/java?page='+str(page)+'&sort=votes&pagesize=10'
        res = requests.get(url, headers={
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'})
        content = BeautifulSoup(res.content, 'lxml')
        items = content.select('div[class="question-summary"]')
        for item in items:
            vote = item.select('div strong')[0].text
            answer = item.select('div strong')[1].text
            view = item.select('div.views')[0]['title'].replace(',','').replace(' views','')
            title = item.select('a[class="question-hyperlink"]')[0].text
            print vote,answer,view,title
            insert_stackoverflow(vote,answer,view,title)
        print '-------------------->%d' % page
        page+=1
        time.sleep(5)

# Run the crawlers only when executed as a script; the original launched
# them as unguarded module-level side effects, so merely importing this
# module would start both crawls.
if __name__ == '__main__':
    fetchstackoverflow()
    fetchMingju()