Le0nsec / SecCrawler

A crawler and push tool that helps security researchers collect daily security digests. Current sources include the XianZhi community (先知社区), AnQuanKe (安全客), Seebug Paper, TiaoTiaoTang (跳跳糖), the QiAnXin offense-defense community (奇安信攻防社区), the LengJiao community (棱角社区), and lab blogs from NSFOCUS (绿盟), Tencent Xuanwu (腾讯玄武), Topsec (天融信), 360, and others. Continuously updated.
GNU General Public License v3.0

Aggregated push of DongJian (洞见) WeChat official account articles #8

Closed · AdminTest0 closed this issue 2 years ago

AdminTest0 commented 2 years ago
package crawler

import (
    "SecCrawler/register"
    "SecCrawler/utils"
    "errors"
    "fmt"
    "io/ioutil"
    "net/http"
    "regexp"
    "strings"
    "time"
)

type DongJian struct{}

func (crawler DongJian) Config() register.CrawlerConfig {
    return register.CrawlerConfig{
        Name:        "DongJian",
        Description: "洞见微信聚合",
    }
}

// Get fetches articles published within the last 24 hours from the DongJian WeChat aggregation feed.
func (crawler DongJian) Get() ([][]string, error) {
    client := &http.Client{
        Timeout: 4 * time.Second,
    }
    req, err := http.NewRequest("GET", "http://wechat.doonsec.com/rss.xml", nil)
    if err != nil {
        return nil, err
    }

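    // Browser-like headers copied from Chrome; they help avoid trivial bot filtering.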
    req.Header.Set("Cache-Control", "no-cache")
    req.Header.Set("Upgrade-Insecure-Requests", "1")
    req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36")
    req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
    req.Header.Set("Sec-Fetch-Site", "none")
    req.Header.Set("Sec-Fetch-Mode", "navigate")
    req.Header.Set("Sec-Fetch-User", "?1")
    req.Header.Set("Sec-Fetch-Dest", "document")
    req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")
    resp, err := client.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, err
    }

    bodyString := string(body)

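    // Pull title (group 1), link (group 2), and pubDate (group 3) out of each <item>.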
    re := regexp.MustCompile(`<item><title>([\w\W]*?)</title><link>([\w\W]*?)</link><description>[\w\W]*?</description><author>[\w\W]*?</author><category>[\w\W]*?</category><pubDate>([\w\W]*?)</pubDate></item>`)
    result := re.FindAllStringSubmatch(strings.TrimSpace(bodyString), -1)

    var resultSlice [][]string
    fmt.Printf("[*] [DongJian] crawler result:\n%s\n\n", utils.CurrentTime())
    timeZone := time.FixedZone("CST", 8*3600)
    // Compiled once outside the loop: matches runs of whitespace in titles.
    wsRe := regexp.MustCompile(`\s+`)
    for _, match := range result {
        title, link, pubDate := match[1], match[2], match[3]

        t, err := time.ParseInLocation(time.RFC1123Z, pubDate, timeZone)
        if err != nil {
            return nil, err
        }

        if !utils.IsIn24Hours(t.In(timeZone)) {
            // The feed is ordered newest to oldest, so stop at the first stale item.
            break
        }

        // Strip newlines and other whitespace from the title.
        title = wsRe.ReplaceAllString(title, "")

        fmt.Println(t.In(timeZone).Format("2006/01/02 15:04:05"))
        fmt.Println(link)
        fmt.Printf("%s\n\n", title)

        resultSlice = append(resultSlice, []string{title, link})
    }
    if len(resultSlice) == 0 {
        return nil, errors.New("no records in the last 24 hours")
    }
    return resultSlice, nil
}
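
For context, a minimal usage sketch of this crawler (the main package and the direct Get call are assumptions for illustration; the real project presumably invokes crawlers through its register machinery):

package main

import (
    "fmt"

    "SecCrawler/crawler"
)

func main() {
    // Each row returned by Get is a [title, link] pair.
    rows, err := crawler.DongJian{}.Get()
    if err != nil {
        fmt.Println("crawl failed:", err)
        return
    }
    for _, row := range rows {
        fmt.Printf("%s -> %s\n", row[0], row[1])
    }
}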
Le0nsec commented 2 years ago

Thanks for the submission, it has been committed.

Hatcat123 commented 2 years ago

All articles: http://wechat.doonsec.com/rss.xml
Filtered recommendations: http://wechat.doonsec.com/bayes_rss.xml
Filtered originals: http://wechat.doonsec.com/copyright_rss.xml
Could you add a config parameter so users can choose which feed type to crawl?

Le0nsec commented 2 years ago

Sure, I'll add it. @Hatcat123
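
For illustration, a sketch of what such an option might look like (the feed-type names, the map, and the helper function are hypothetical; only the three URLs come from the comment above):

package crawler

import "fmt"

// Hypothetical mapping from a configured feed type to the DoonSec RSS endpoints.
var dongJianFeeds = map[string]string{
    "all":       "http://wechat.doonsec.com/rss.xml",           // all articles
    "recommend": "http://wechat.doonsec.com/bayes_rss.xml",     // filtered recommendations
    "original":  "http://wechat.doonsec.com/copyright_rss.xml", // filtered originals
}

// dongJianFeedURL resolves the configured feed type to a URL, defaulting to "all".
func dongJianFeedURL(feedType string) (string, error) {
    if feedType == "" {
        feedType = "all"
    }
    url, ok := dongJianFeeds[feedType]
    if !ok {
        return "", fmt.Errorf("unknown DongJian feed type: %q", feedType)
    }
    return url, nil
}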