cuducos / minha-receita

🏢 Sua API web para consulta de informações do CNPJ da Receita Federal
https://minhareceita.org
MIT License
1.29k stars 129 forks source link

Reverter `got` #148

Closed cuducos closed 1 year ago

cuducos commented 1 year ago

O PR #145 tinha boa intenção mas, depois de utilizar o got integrado por 2 semanas, não consegui completar o download de todos os arquivos. Acredito que valha a pena remover o got.

cuducos commented 1 year ago

Reescrevi a estratégia do got de uma forma que teremos mais controle sobre os downloads. Vou colar aqui para não perder, mas a ideia passa a ser remover o got e implementar essa lógica (com testes e tudo mais).

Salve esse arquivo como main.go em algum diretório, e use com go run main.go TAMANHO-DE-CADA-PEDAÇO URL DESTINO, por exemplo go run main.go 1000000 http://200.152.38.155/CNPJ/Paises.zip Paises.zip para baixar o arquivo em pedaços de 1 MB.

package main

import (
    "fmt"
    "io"
    "log"
    "math"
    "net/http"
    "os"
    "strconv"
)

// retries is the retry budget newChunk assigns to every chunk it creates;
// download only re-dispatches a failed chunk while its budget is above zero.
const retries = 7

// chunk describes one byte range of the remote file together with the outcome
// of trying to download it.
type chunk struct {
    // idx is the zero-based position of the chunk within the file.
    idx int
    // retries is the retry budget still available to this chunk.
    retries int
    // start and end are the inclusive byte offsets of the range.
    start uint64
    end   uint64
    // size is the number of bytes in the range.
    size uint64
    // err is set when the download of this chunk failed.
    err error
    // contents holds the bytes read from the response body.
    contents []byte
}

// newChunk builds the chunk at position idx covering the inclusive byte range
// [start, end]. The chunk starts with the full retry budget and its size is
// derived from the range bounds.
func newChunk(idx int, start, end uint64) chunk {
    return chunk{
        idx:     idx,
        retries: retries,
        start:   start,
        end:     end,
        size:    end - start + 1, // both bounds are inclusive
    }
}

// newChunkFrom builds a fresh chunk for retrying the byte range of a chunk
// that failed: same index and range, no error and no contents. The retry
// budget is carried over and decremented by one, so that repeated failures
// eventually exhaust it instead of being reset to the full budget forever.
func newChunkFrom(chunk chunk) chunk {
    c := newChunk(chunk.idx, chunk.start, chunk.end)
    c.retries = chunk.retries - 1
    return c
}

// chunckDowloader downloads a single remote file in byte-range chunks.
type chunckDowloader struct {
    // url is the address of the remote file; path is the local destination.
    url, path string
    // client performs every HTTP request (HEAD for the size, GET for chunks).
    client *http.Client
    // done receives every chunk once its download attempt has finished,
    // whether it succeeded or failed.
    done chan chunk
}

// downloadChunk requests the inclusive byte range [chunk.start, chunk.end] of
// c.url using an HTTP Range header and always reports the outcome on c.done.
// On success chunk.contents holds the bytes read; on failure chunk.err is set
// and chunk.contents is left empty, so that no partial data is handed over.
func (c *chunckDowloader) downloadChunk(chunk chunk) {
    log.Output(2, fmt.Sprintf("starting to download chunk %d…", chunk.idx))   // TODO: remove
    defer log.Output(2, fmt.Sprintf("done to download chunk %d…", chunk.idx)) // TODO: remove
    // The closure reads chunk when it runs, so it sends the final state
    // (including err/contents assigned below), not the state at defer time.
    defer func() { c.done <- chunk }()
    req, err := http.NewRequest(http.MethodGet, c.url, nil)
    if err != nil {
        chunk.err = fmt.Errorf("could not create a request: %w", err)
        return
    }
    req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", chunk.start, chunk.end))
    resp, err := c.client.Do(req)
    if err != nil {
        chunk.err = fmt.Errorf("error sending the http request: %w", err)
        return
    }
    defer resp.Body.Close()
    // A range request is answered with 206. A plain 200 means the server
    // ignored the Range header and is sending the file from byte 0, which is
    // only usable for a chunk starting at 0 (the length check below then
    // guarantees the body is exactly this chunk).
    partial := resp.StatusCode == http.StatusPartialContent
    wholeFromStart := resp.StatusCode == http.StatusOK && chunk.start == 0
    if !partial && !wholeFromStart {
        chunk.err = fmt.Errorf("got unexpected http status %s for range %d-%d", resp.Status, chunk.start, chunk.end)
        return
    }
    if resp.ContentLength != int64(chunk.size) {
        chunk.err = fmt.Errorf("got wrong content-length, expected %d, got %d", chunk.size, resp.ContentLength)
        return
    }
    body, err := io.ReadAll(resp.Body)
    if err != nil {
        chunk.err = fmt.Errorf("could not read chunk response body: %w", err)
        return
    }
    chunk.contents = body
}

// getSize sends an HTTP HEAD request to c.url and returns the size of the
// remote file as reported by the Content-Length header.
//
// It returns an error when the request fails, when the response status is not
// 2xx, when the reported content length is not positive, or when the server
// explicitly declares "Accept-Ranges: none".
func (c *chunckDowloader) getSize() (uint64, error) {
    r, err := c.client.Head(c.url)
    if err != nil {
        return 0, fmt.Errorf("error sending a http head request: %w", err)
    }
    defer r.Body.Close()
    // Without this check the Content-Length of an error page would be taken
    // as the size of the file.
    if r.StatusCode < 200 || r.StatusCode > 299 {
        return 0, fmt.Errorf("got http status %s for the head request", r.Status)
    }
    if r.ContentLength <= 0 {
        return 0, fmt.Errorf("got content-length %d", r.ContentLength)
    }
    if r.Header.Get("Accept-Ranges") == "none" {
        return 0, fmt.Errorf("server does not accept http range requests")
    }
    return uint64(r.ContentLength), nil
}

// download fetches url into the file at path by splitting the remote file in
// consecutive, non-overlapping byte ranges of at most chunkSize bytes, which
// are downloaded concurrently using client and written at their own offset.
//
// A chunk that fails is downloaded again while it still has retry budget; once
// the budget is exhausted download returns the error of that chunk. It also
// returns an error when chunkSize is zero, when the size of the remote file
// cannot be determined, or when the destination file cannot be created,
// resized, written or closed.
func download(client *http.Client, chunkSize uint64, url, path string) error {
    if chunkSize == 0 {
        // A zero chunk size would never advance through the file.
        return fmt.Errorf("chunk size must be greater than zero")
    }
    c := chunckDowloader{
        url:    url,
        path:   path,
        client: client,
    }
    size, err := c.getSize()
    if err != nil {
        return fmt.Errorf("could not get the size of %s: %w", url, err)
    }
    log.Output(2, fmt.Sprintf("file size = %d", size)) // TODO: remove
    count := uint64(math.Ceil(float64(size) / float64(chunkSize)))
    log.Output(2, fmt.Sprintf("total chunks = %d", count)) // TODO: remove

    // Create the destination before starting any goroutine, so a failure here
    // does not leave downloads running with nobody listening.
    dest, err := os.Create(path)
    if err != nil {
        return fmt.Errorf("could not create %s: %w", path, err)
    }
    defer dest.Close() // covers the error paths; the success path closes explicitly
    if err := dest.Truncate(int64(size)); err != nil {
        return fmt.Errorf("could not truncate %s to %d bytes: %w", path, size, err)
    }

    // At most count chunks are in flight at any time (a retry only starts
    // after its failed attempt was received), so with this buffer no goroutine
    // blocks forever on send when download returns early.
    c.done = make(chan chunk, count)

    var idx int
    for start := uint64(0); start < size; {
        end := size - 1
        if size-start > chunkSize {
            end = start + chunkSize - 1
        }
        log.Output(2, fmt.Sprintf("chunk %d from %d to %d", idx, start, end)) // TODO: remove
        go c.downloadChunk(newChunk(idx, start, end))
        start = end + 1 // ranges are inclusive: the next chunk starts right after end
        idx++
    }

    for completed := uint64(0); completed < count; {
        got := <-c.done
        if got.err != nil {
            log.Output(2, fmt.Sprintf("error downloading chunk #%d: %s", got.idx+1, got.err)) // TODO: remove
            if got.retries <= 0 {
                return fmt.Errorf("could not download %s: %w", url, got.err)
            }
            log.Output(2, fmt.Sprintf("retrying chunk #%d…", got.idx+1)) // TODO: remove
            retry := newChunkFrom(got)
            // Set the budget explicitly so retries are always bounded.
            retry.retries = got.retries - 1
            go c.downloadChunk(retry)
            continue // a failed chunk has nothing to write and is not completed
        }
        if _, err := dest.WriteAt(got.contents, int64(got.start)); err != nil {
            return fmt.Errorf("could not write chunk #%d to %s: %w", got.idx+1, path, err)
        }
        completed++
    }
    if err := dest.Close(); err != nil {
        return fmt.Errorf("could not close %s: %w", path, err)
    }
    return nil
}

// main reads CHUNK SIZE, URL and DESTINATION from the command line and
// downloads URL to DESTINATION in pieces of CHUNK SIZE bytes. It exits with an
// error message when the arguments are missing or invalid, or when the
// download fails.
func main() {
    if len(os.Args) != 4 {
        log.Fatal("missing CHUNK SIZE and/or URL and/or DESTINATION.\nUsage: chunk CHUNK SIZE URL DESTINATION")
    }
    chunkSize, err := strconv.ParseUint(os.Args[1], 10, 64)
    if err != nil {
        log.Fatalf("%s is not a valid CHUNK SIZE: %s", os.Args[1], err)
    }
    if chunkSize == 0 {
        // A zero-byte chunk can never cover the file.
        log.Fatal("CHUNK SIZE must be greater than zero")
    }
    if err := download(&http.Client{}, chunkSize, os.Args[2], os.Args[3]); err != nil {
        log.Fatal(err)
    }
}