gocolly / colly

Elegant Scraper and Crawler Framework for Golang
https://go-colly.org/
Apache License 2.0
23.2k stars 1.76k forks source link

Problem getting the response.Request.ProxyURL field value: it is occasionally empty #708

Open chjf2008 opened 2 years ago

chjf2008 commented 2 years ago

I use a large pool of proxies to crawl the Amazon website, but in the OnError callback I cannot reliably get response.Request.ProxyURL because it is not always populated. I want to remove all the proxies that are not working. Thank you. I have attached the output and the full code below. Output:

-----onError,ProxyURL:,error:Get "https://www.amazon.com/dp/B00XK69NRW": read tcp 192.168.49.15:61729->89.203.235.110:9999: read: connection reset by peer-----

-----onResponse,ProxyURL:http://157.100.52.149:999,status code:200----
------onError,ProxyURL:http://45.167.125.209:9992,error:context deadline exceeded (Client.Timeout or context cancellation while reading body)-----
------onError,ProxyURL:,error:Get "https://www.amazon.com/dp/B07SQXG8L1": context deadline exceeded (Client.Timeout exceeded while awaiting headers)-----
------onError,ProxyURL:http://200.58.87.195:8080,error:context deadline exceeded (Client.Timeout or context cancellation while reading body)-----
------onError,ProxyURL:,error:Get "https://www.amazon.com/dp/B004YD769C": proxyconnect tcp: dial tcp 118.42.15.57:4003: i/o timeout (Client.Timeout exceeded while awaiting headers)-----
------onError,ProxyURL:,error:Get "https://www.amazon.com/dp/B009UPUJIY": context deadline exceeded (Client.Timeout exceeded while awaiting headers)-----
------onError,ProxyURL:,error:Get "https://www.amazon.com/dp/B07HF3X6Y4": context deadline exceeded (Client.Timeout exceeded while awaiting headers)-----
------onError,ProxyURL:,error:Get "https://www.amazon.com/dp/B088SXVHP3": proxyconnect tcp: dial tcp 222.111.51.161:4007: i/o timeout (Client.Timeout exceeded while awaiting headers)-----
------onError,ProxyURL:,error:Get "https://www.amazon.com/dp/B01KIMOEW4": context deadline exceeded (Client.Timeout exceeded while awaiting headers)-----

Full code:

package Test

import (
    "crypto/tls"
    "fmt"
    "net/http"
    "time"
    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/proxy"
)

// RunTest crawls a fixed list of Amazon product pages through a round-robin
// pool of HTTP and SOCKS5 proxies. For every response and every error it
// prints the proxy URL that colly reports on the request.
//
// Setup failures (invalid limit rule, unparsable proxy list) are printed and
// abort the run, so the crawl never silently proceeds without proxies.
func RunTest() {
    c := colly.NewCollector()

    c.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"

    c.AllowURLRevisit = true
    c.SetRequestTimeout(20 * time.Second)
    if err := c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 10,
        Delay:       2 * time.Second,
        RandomDelay: 5 * time.Second,
    }); err != nil {
        fmt.Println(err)
        return
    }

    // Install the custom transport before the proxy function so that the
    // proxy function is attached to this transport.
    c.WithTransport(&http.Transport{
        DisableKeepAlives: true,
        TLSClientConfig:   &tls.Config{InsecureSkipVerify: false},
    })

    c.Async = true

    proxies := []string{
        "http://152.26.231.77:9443",
        "http://205.155.45.139:3128",
        "http://209.80.129.2:3128",
        "http://45.167.125.209:9992",
        "http://222.111.51.161:4007",
        "http://212.160.115.5:8080",
        "http://200.58.87.195:8080",
        "http://118.42.15.57:4003",
        "http://157.100.52.149:999",
        "http://89.203.235.110:9999",
        "http://179.1.129.136:999",
        "http://181.65.154.174:999",
        "http://45.189.112.225:999",
        "http://181.48.23.250:8080",
        "http://45.7.133.236:999",
        "http://152.231.25.58:8080",
        "http://45.167.90.21:999",
        "http://102.38.5.161:8080",
        "http://187.102.236.209:999",
        "http://189.20.85.172:8080",
        "http://45.189.254.10:999",
        "http://103.11.106.148:8181",
        "http://61.144.152.209:9000",
        "http://201.217.55.97:8080",
        "socks5://193.164.134.126:9090",
        "socks5://5.161.86.206:1080",
        "socks5://5.161.100.145:1080",
        "socks5://217.12.201.56:8085",
        "socks5://5.161.93.53:1080",
        "socks5://157.90.107.62:1081",
        "socks5://167.114.100.218:51043",
        "socks5://159.69.153.169:5566",
        "socks5://37.18.73.60:5566",
        "socks5://112.54.33.47:7302",
    }

    asins := []string{
        "https://www.amazon.com/dp/B009UPUJIY",
        "https://www.amazon.com/dp/B00004OCLJ",
        "https://www.amazon.com/dp/B07QY8P3B1",
        "https://www.amazon.com/dp/B01KIMOEW4",
        "https://www.amazon.com/dp/B07HF3X6Y4",
        "https://www.amazon.com/dp/B01FRDSU1A",
        "https://www.amazon.com/dp/B088SXVHP3",
        "https://www.amazon.com/dp/B004YD769C",
        "https://www.amazon.com/dp/B007TJGZ54",
        "https://www.amazon.com/dp/B08Y5MZPTB",
        "https://www.amazon.com/dp/B07D19C9VG",
        "https://www.amazon.com/dp/B01JLPJM8U",
        "https://www.amazon.com/dp/B00XK69NRW",
        "https://www.amazon.com/dp/B006R9J5UO",
        "https://www.amazon.com/dp/B087RRWGJB",
        "https://www.amazon.com/dp/B091F621QS",
        "https://www.amazon.com/dp/B07BKVN8S4",
        "https://www.amazon.com/dp/B08865Q8Q8",
        "https://www.amazon.com/dp/B07SQXG8L1",
    }

    // Without a working switcher no proxy function would be installed and
    // every request would go out directly, so stop instead of continuing.
    switcher, err := proxy.RoundRobinProxySwitcher(proxies...)
    if err != nil {
        fmt.Println(err)
        return
    }
    c.SetProxyFunc(switcher)

    // Placeholder handler: links are matched but intentionally not followed.
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
    })

    c.OnResponse(func(res *colly.Response) {
        fmt.Printf("-----onResponse,ProxyURL:%s,status code:%d----\n", res.Request.ProxyURL, res.StatusCode)
    })

    c.OnError(func(res *colly.Response, err error) {
        // NOTE(review): ProxyURL has been observed to be empty here when the
        // request fails before any response is received (e.g. proxyconnect
        // failures or timeouts while awaiting headers); it appears to be
        // populated only once a response has come back — confirm against
        // the colly version in use.
        fmt.Printf("------onError,ProxyURL:%s,error:%v-----\n", res.Request.ProxyURL, err)
    })

    for _, url := range asins {
        // In async mode fetch failures are delivered to OnError; an error
        // returned here means the request could not be scheduled at all.
        if err := c.Visit(url); err != nil {
            fmt.Printf("visit %s: %v\n", url, err)
        }
    }
    c.Wait()
}
duncup commented 2 years ago

Got this too.

farshadff commented 1 year ago

Were you able to fix your problem? I have the exact same problem too.