gocolly / colly

Elegant Scraper and Crawler Framework for Golang
https://go-colly.org/
Apache License 2.0
23.25k stars 1.76k forks source link

Why I can't do same in Colly as in Goquery #294

Closed tobi007 closed 5 years ago

tobi007 commented 5 years ago

please I trying to learn webscrapping using goquery and colly. The above code is from http://sandipbgt.com/2018/08/23/scraping-tutorial-with-golang/ for getting list of personal projects. The above code works, but I'm trying to do the same using colly but it's not working, I can't even get it to log in. Find the code below when I used colly.

package main

import (
    "fmt"
    "io/ioutil"
    "log"
    "net/http"
    "net/http/cookiejar"
    "net/url"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

const (
    baseURL = "https://gitlab.com"
)

var (
    username = "your gitlab username"
    password = "your gitlab password"
)

type App struct {
    Client *http.Client
}

type AuthenticityToken struct {
    Token string
}

type Project struct {
    Name string
}

func (app *App) getToken() AuthenticityToken {
    loginURL := baseURL + "/users/sign_in"
    client := app.Client

    response, err := client.Get(loginURL)

    if err != nil {
        log.Fatalln("Error fetching response. ", err)
    }

    defer response.Body.Close()

    document, err := goquery.NewDocumentFromReader(response.Body)
    if err != nil {
        log.Fatal("Error loading HTTP response body. ", err)
    }

    token, _ := document.Find("input[name='authenticity_token']").Attr("value")

    authenticityToken := AuthenticityToken{
        Token: token,
    }

    return authenticityToken
}

func (app *App) login() {
    client := app.Client

    authenticityToken := app.getToken()

    loginURL := baseURL + "/users/sign_in"

    data := url.Values{
        "authenticity_token": {authenticityToken.Token},
        "user[login]":        {username},
        "user[password]":     {password},
    }

    response, err := client.PostForm(loginURL, data)

    if err != nil {
        log.Fatalln(err)
    }

    defer response.Body.Close()

    _, err = ioutil.ReadAll(response.Body)
    if err != nil {
        log.Fatalln(err)
    }
}

func (app *App) getProjects() []Project {
    projectsURL := baseURL + "/dashboard/projects"
    client := app.Client

    response, err := client.Get(projectsURL)

    if err != nil {
        log.Fatalln("Error fetching response. ", err)
    }

    defer response.Body.Close()

    document, err := goquery.NewDocumentFromReader(response.Body)
    if err != nil {
        log.Fatal("Error loading HTTP response body. ", err)
    }

    var projects []Project

    document.Find(".project-name").Each(func(i int, s *goquery.Selection) {
        name := strings.TrimSpace(s.Text())
        project := Project{
            Name: name,
        }

        projects = append(projects, project)
    })

    return projects
}

func main() {
    jar, _ := cookiejar.New(nil)

    app := App{
        Client: &http.Client{Jar: jar},
    }

    app.login()
    projects := app.getProjects()

    for index, project := range projects {
        fmt.Printf("%d: %s\n", index+1, project.Name)
    }
}

This is the code trying it with colly. Please what am I missing.

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
    "log"
)

const (
    baseURL = "https://gitlab.com"
)

var (
    username = "your gitlab username"
    password = "your gitlab password
)

type App struct {
    Client *colly.Collector
}

type AuthenticityToken struct {
    Token string
}

type Project struct {
    Name string
}

func (app *App) getToken() AuthenticityToken {
    client := app.Client.Clone()
    loginURL := baseURL + "/users/sign_in"
    authenticityToken := AuthenticityToken{}

    // On every a element which has href attribute call callback
    client.OnHTML(".new_user > input[name='authenticity_token']", func(e *colly.HTMLElement) {
        token := e.Attr("value")
        authenticityToken.Token = token
        // Print link
        fmt.Printf("Link found: %q -> %s\n", e.Text, token) // Visit link found on page
    })

    // Start scraping on https://hackerspaces.org
    client.Visit(loginURL)
    client.Wait()

    return authenticityToken
}

func (app *App) login() {
    client := app.Client
    loginURL := baseURL + "/users/sign_in/"
    projectsURL := baseURL + "/dashboard/projects/"

    data := map[string]string{
        "authenticity_token": app.getToken().Token,
        "user[login]":        username,
        "user[password]":     password,
        "user[remember_me]": "0",
    }

    // On every a element which has href attribute call callback
    client.OnHTML(".project-name", func(e *colly.HTMLElement) {
        token := e.Attr("value")
        //authenticityToken.Token = token
        // Print link
        fmt.Printf("Link found: %q -> %s\n", e.Text, token) // Visit link found on page
    })

    // attach callbacks after login
    client.OnResponse(func(r *colly.Response) {
        log.Println("response received", string(r.Body))
    })

    err := client.Post(loginURL, data)
    if err != nil {
        log.Fatalln("Error fetching response. ", err)
    }
    client.Wait()
    // Start scraping on https://hackerspaces.org
    client.Visit(projectsURL)

}

func main()  {
    c := colly.NewCollector(
        colly.AllowURLRevisit(),
        colly.MaxDepth(1),
        colly.Debugger(&debug.LogDebugger{}),
        colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"),
    )
    app := App{c}
    app.login()
}
vosmith commented 5 years ago

I've been messing with this for a few hours. I've been able to get it to work, if I disable the redirect after login. I don't think the redirect specifically is causing the problem...trying to dig down further.

Basically, I added a c.RedirectHandler() that just returns http.ErrUseLastResponse. Also, I ignore the error from the client.Post(loginURL,data) as it propogates that error. After that, the Visit to the projectsURL goes through without an issue...but I think I'm missing the real issue here

vosmith commented 5 years ago

Found the issue in the source code. The checkRedirectFunc() by default copies the request headers from the previous request. In this case, theCookie from the lastRequest was no longer valid as it was replaced by the response from signin. Sending this old Cookie invalidates the session and automatically logs you out of the system.

As a work around, you can override the RedirectHandler() so that it just returns nil, so it will not modify the current request. I will submit a PR to address this issue.

Thanks for the report!