zhulinpinyu / zhulinpinyu.github.io

Blog
http://blog.zhulinpinyu.com
Apache License 2.0
2 stars 0 forks source link

预报爬虫 #17

Open zhulinpinyu opened 4 years ago

zhulinpinyu commented 4 years ago
import requests
from bs4 import BeautifulSoup

def weather(url):
  """Fetch the page at ``url`` and return the weather data parsed from it.

  The markup is downloaded with ``get_raw_html`` and handed unchanged to
  ``extract_data``; whatever that parser returns is returned as-is.
  """
  return extract_data(get_raw_html(url))

def get_raw_html(url, timeout=10):
  """Download ``url`` and return the response body as text.

  The request carries a fixed set of browser-style headers (Chrome
  User-Agent, Accept, Accept-Language, ...).

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up; passed
      straight through to ``requests.get``.

  Returns:
    The decoded response body (``Response.text``).

  Raises:
    requests.exceptions.Timeout: If the server does not answer in time.
    requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
      status code.
  """
  headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
  }
  # requests applies no timeout by default, so an unresponsive server would
  # otherwise block this call forever.
  res = requests.get(url, headers=headers, timeout=timeout)
  # Fail here on an HTTP error instead of handing an error page to the parser.
  res.raise_for_status()
  return res.text

def _require_div(parent, css_class):
  """Return the first ``<div class=css_class>`` under ``parent`` or raise ValueError."""
  node = parent.find('div', class_=css_class)
  if node is None:
    raise ValueError(
      'expected <div class="{}"> not found in document'.format(css_class)
    )
  return node


def _p_strings(parent, css_class):
  """Return the ``.string`` of every ``<p class=css_class>`` under ``parent``."""
  return [tag.string for tag in parent.find_all('p', class_=css_class)]


def extract_data(doc):
  """Parse a weather page and return current conditions plus forecasts.

  Args:
    doc: HTML markup of the page, as a string.

  Returns:
    A dict with three keys:
      'current': dict of the values read from the ``c-left`` panel
        (location, temperature, description, date, wind, humidity, AQI,
        air quality, update time),
      'forecast7d': list of dicts with 'date', 'text', 'des', 'maxTemp',
        'minTemp' and 'wind', built from the ``c-right`` panel,
      'forecast24h': list of dicts with 'time' and 'temp', built from the
        ``cleft-24hours`` panel.

  Raises:
    ValueError: If one of the four top-level containers is missing from
      the markup. Lookups inside those containers are not validated, so a
      missing inner element still surfaces as the underlying
      TypeError/AttributeError/KeyError.
  """
  soup = BeautifulSoup(doc, "lxml")
  box = _require_div(soup, 'n-container')
  today_box = _require_div(box, 'c-left')
  week_box = _require_div(box, 'c-right')
  box_24h = _require_div(soup, 'cleft-24hours')

  # current
  current = {
    # The location name is carried in the 'cname' attribute of the first
    # <a> inside the main container.
    '位置': box.select_one('a')['cname'],
    # Only the first child node of span.num is used as the temperature.
    '温度': today_box.select_one('span.num').contents[0],
    '天气': today_box.select_one('p.text').string,
    'date': today_box.select_one('a.date').string.strip(),
    # For wind and humidity the value is the third child node of the span
    # (index 2); presumably an icon/label precedes it - confirm against the
    # page markup.
    '风力': today_box.select_one('span.wind').contents[2],
    '湿度': today_box.select_one('span.hundity').contents[2],
    'AQI': today_box.select_one('span.liv-text > a > em').string,
    '空气质量': today_box.select_one('span.liv-text > a > span.liv-img').string,
    'updated_at': today_box.select_one('div.row4 > p').string
  }

  # 7d
  # High/low temperatures are comma-separated lists stored in data-*
  # attributes of a single div, so look that div up once.
  temp_row = week_box.select_one('div.r-temp')
  max_temps = temp_row['data-high'].split(",")
  min_temps = temp_row['data-low'].split(",")
  # zip stops at the shortest input, so entries are only produced while
  # every column still has a value.
  forecast7d = [
    {
      'date': day,
      'text': text,
      'des': des,
      'maxTemp': high,
      'minTemp': low,
      'wind': wind,
    }
    for day, text, des, high, low, wind in zip(
      _p_strings(week_box, 'date'),
      _p_strings(week_box, 'text'),
      _p_strings(week_box, 'des'),
      max_temps,
      min_temps,
      _p_strings(week_box, 'wind'),
    )
  ]

  # 24h
  forecast24h = [
    {'time': time, 'temp': temp}
    for time, temp in zip(
      _p_strings(box_24h, 'time'),
      _p_strings(box_24h, 'temp'),
    )
  ]

  return {
    'current': current,
    'forecast7d': forecast7d,
    'forecast24h': forecast24h
  }

# Fetch and parse the page for one fixed location, then print a short
# summary of the current conditions.
data = weather("http://tianqi.sogou.com/pc/weather/2332634")
current = data['current']

print(current['date'])
# Location, description, temperature (with a degree sign appended),
# humidity and wind on a single space-separated line.
print(
  current['位置'],
  current['天气'],
  current['温度'] + '°',
  current['湿度'],
  current['风力'],
)
print('AQI: ' + current['AQI'])
print('空气质量: ' + current['空气质量'])
print(current['updated_at'])