popeyelau / wiki

📒Wiki for many useful notes, source, commands and snippets.
2 stars 0 forks source link

快速使用 goquery 抓取网页内容 并转换成 json 输出 #1

Open popeyelau opened 5 years ago

popeyelau commented 5 years ago

参考资料

- goquery
- Golang 中使用 JSON 的一些小技巧


package main

import (
    "encoding/json"
    "fmt"
    "net/http"
    "net/url"

    "github.com/PuerkitoBio/goquery"
)

// VidoeType identifies a video category. videos sends its numeric value as
// the "type" query parameter of the search request.
//
// NOTE(review): the name looks like a misspelling of "VideoType"; it is kept
// as is because videos refers to the type by this name.
type VidoeType int

// Video categories. Values start at 1 (iota + 1), so the zero value of
// VidoeType does not correspond to any declared category.
const (
    VIDEO_FILM VidoeType = iota + 1
    VIDEO_DRAMA
    VIDEO_CARTOON
    VIDEO_VARIETY
)

// SortType selects the ordering requested from the video list endpoint.
type SortType int

// Available sort orders. Values start at 1 (iota + 1).
const (
    SORT_BY_PLAY SortType = iota + 1
    SORT_BY_UPDATE
    SORT_BY_LIKE
)

// String returns the query-string token for the sort order: "update" for
// SORT_BY_UPDATE, "like" for SORT_BY_LIKE, and "play" for SORT_BY_PLAY as
// well as for any value that is not one of the declared constants.
func (s SortType) String() string {
    if s == SORT_BY_UPDATE {
        return "update"
    }
    if s == SORT_BY_LIKE {
        return "like"
    }
    // SORT_BY_PLAY and every unknown value share the same token.
    return "play"
}

// HOST is the base URL (scheme and host, no trailing slash) that every
// request URL in this program is built from.
const (
    HOST string = "https://i.hyys.me"
)

// main runs the three scrapers in sequence as a demonstration: the index
// page, the detail page of one sample video, and the first page of the film
// list sorted by play count. Each call prints its own result.
func main() {
    const (
        sampleVideoID = "42768"
        firstPage     = 0
    )

    home()
    video(sampleVideoID)
    videos(VIDEO_FILM, SORT_BY_PLAY, firstPage)
}

// home fetches the site's index page, groups the video cards found there
// into titled sections, and prints the sections to stdout as indented JSON.
//
// If the page cannot be fetched or the result cannot be encoded, the error is
// printed and the function returns without producing any JSON.
func home() {
    requestURL := fmt.Sprintf("%v/webIndex.php", HOST)
    doc, err := getDocument(requestURL)
    if err != nil {
        fmt.Println(err)
        // doc is nil when getDocument fails; carrying on would dereference it.
        return
    }

    sectionDivs := doc.Find(".hideNavBar>div[class=container]")
    sections := make([]*Section, sectionDivs.Length())
    sectionDivs.Each(func(i int, s *goquery.Selection) {
        videoCards := s.Find(".videoContent>div[class=videoCard]")
        sections[i] = &Section{
            Title: s.Find("p>a:first-of-type").Text(),
            Items: parseVideos(videoCards),
        }
    })

    // Named "out" rather than "json" so the encoding/json package is not shadowed.
    out, err := marshalIndent(sections)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Println(*out)
}

// video fetches the detail page of the video with the given id and prints
// its episode list to stdout as indented JSON. Each episode carries the link
// text as its name and the link's "data-href" attribute as its URL (empty
// when the attribute is absent). Errors are printed and end the function.
func video(id string) {
    detailURL := fmt.Sprintf("%v/webIndex.php/?code=videoDetail&vod_id=%v", HOST, id)
    page, err := getDocument(detailURL)
    if err != nil {
        fmt.Println(err)
        return
    }

    links := page.Find(".dramaSeriesCont>a")
    // Zero length with known capacity: stays a non-nil slice, so an empty
    // result still encodes as [] rather than null.
    episodes := make([]*Episode, 0, links.Length())
    links.Each(func(_ int, link *goquery.Selection) {
        // Attr yields "" when the attribute is missing, which is the URL's zero value.
        href, _ := link.Attr("data-href")
        episodes = append(episodes, &Episode{Name: link.Text(), URL: href})
    })

    out, err := marshalIndent(episodes)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Println(*out)
}

// videos fetches one page of the video list for the given category and sort
// order and prints the parsed videos to stdout as indented JSON.
//
// typeID is sent as the "type" query parameter, sort (via its String method)
// as "most", and page as "page"; a negative page is treated as 0. Errors are
// printed and end the function.
func videos(typeID VidoeType, sort SortType, page int) {
    if page < 0 {
        page = 0
    }

    listURL := fmt.Sprintf("%v/webIndex.php/?code=search&type=%d&most=%s&page=%d", HOST, typeID, sort, page)
    listing, err := getDocument(listURL)
    if err != nil {
        fmt.Println(err)
        return
    }

    items := parseVideos(listing.Find(".videoContent>div[class=videoCard]"))
    out, err := marshalIndent(items)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Println(*out)
}

// parseVideos converts a selection of video-card elements into Video values,
// one per element, in the order the selection yields them.
//
// For each card it reads:
//   - the "data-video-url" attribute into URL, and that URL's "vod_id" query
//     parameter into ID;
//   - the "src" attribute of the ".videoCover" descendant into Cover;
//   - the text of ".videoInfo>p[class=videoName]" into Name;
//   - the text of ".videoInfo>div[class=jusBetween]>p" into Views.
//
// Missing attributes leave the corresponding field empty. A "data-video-url"
// that cannot be parsed is still stored in URL, but ID is left empty.
func parseVideos(s *goquery.Selection) []*Video {
    items := make([]*Video, s.Length())
    s.Each(func(i int, card *goquery.Selection) {
        item := &Video{}
        if src, ok := card.Attr("data-video-url"); ok {
            item.URL = src
            // ParseRequestURI returns a nil *url.URL on failure; only query
            // it when parsing succeeded instead of dereferencing nil.
            if parsed, err := url.ParseRequestURI(src); err == nil {
                item.ID = parsed.Query().Get("vod_id")
            }
        }
        if cover, ok := card.Find(".videoCover").Attr("src"); ok {
            item.Cover = cover
        }
        item.Name = card.Find(".videoInfo>p[class=videoName]").Text()
        item.Views = card.Find(".videoInfo>div[class=jusBetween]>p").Text()
        items[i] = item
    })
    return items
}

// marshalIndent encodes source as JSON indented with two spaces and returns
// a pointer to the resulting string.
//
// On failure it returns a nil pointer together with the encoding error. The
// error is not printed here: reporting is left to the caller so that a
// failure is not reported twice.
func marshalIndent(source interface{}) (*string, error) {
    // Local names avoid shadowing the encoding/json and bytes packages.
    encoded, err := json.MarshalIndent(source, "", "  ")
    if err != nil {
        return nil, err
    }
    out := string(encoded)
    return &out, nil
}

// getDocument performs an HTTP GET on url with a desktop-browser User-Agent
// header and parses the response body as an HTML document.
//
// It returns an error if the request cannot be built or sent, if the server
// answers with a non-2xx status, or if the body cannot be parsed. The
// response body is always closed before returning.
//
// NOTE(review): the client has no timeout, so a stalled server blocks the
// caller indefinitely; consider setting http.Client.Timeout.
func getDocument(url string) (*goquery.Document, error) {
    req, err := http.NewRequest(http.MethodGet, url, nil)
    if err != nil {
        return nil, err
    }
    req.Header.Add("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36")

    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    // Refuse to parse error pages as if they were real content.
    if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
        return nil, fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
    }

    // NewDocumentFromResponse is deprecated; read from the body directly and
    // leave closing it to the defer above.
    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return nil, err
    }
    // Keep the document's Url populated, as NewDocumentFromResponse did.
    if resp.Request != nil {
        doc.Url = resp.Request.URL
    }
    return doc, nil
}

// Section is one titled group of videos on the index page, as built by home.
type Section struct {
    Title string   `json:"title"` // text of the section's first heading link
    Items []*Video `json:"items"` // videos parsed from the section's video cards
}

// Video holds the fields parseVideos extracts from a single video card.
type Video struct {
    ID    string `json:"id"`    // "vod_id" query parameter taken from URL
    Name  string `json:"name"`  // text of the card's videoName paragraph
    URL   string `json:"-"`     // card's "data-video-url" attribute; excluded from the JSON output
    Cover string `json:"cover"` // "src" attribute of the card's .videoCover element
    Views string `json:"views"` // text of the paragraph(s) in the card's jusBetween block; presumably a view count — verify against the page
}

// Episode is one entry of a video's episode list, as built by video.
type Episode struct {
    Name string `json:"name"` // text of the episode link
    URL  string `json:"url"`  // the link's "data-href" attribute; empty when absent
}
popeyelau commented 5 years ago

Golang 相关资料