Open EdbertoLima opened 2 weeks ago
Welcome! Your issue will be analyzed as soon as possible. Hopefully, we can find a solution to the problem together, please try to provide as much information as possible to help us identify and fix the bug or improve the repository.
I'm getting the same error (tried from local, through VPN and using Google Colab). From inspecting the network it seems that the problem is not with the library but with the actual server (https://consultapublica.car.gov.br/publico/...).
I encountered 2 main problems:
Both of the issues are connected to maintaining progress while implementing some kind of a "retry" mechanism inside the library.
As a workaround, I wrote a hacky solution (not something I would use in production code) that preserves download progress across retries by patching one of the library's internal functions.
from types import MethodType
from types import MethodType
from pathlib import Path
import os
import time
from tqdm import tqdm
import httpx
from urllib.parse import urlencode
from SICAR.exceptions import (
UrlNotOkException,
FailedToDownloadPolygonException,
)
from collections import deque
def custom_download_polygon(
    self,
    state,
    polygon,
    captcha: str,
    folder: str,
    chunk_size: int = 1024,
    max_retries: int = 200,
    retry_delay: int = 1,
    min_speed_threshold: int = 100,  # Minimum speed in bytes per second
    speed_check_interval: int = 10,  # Number of chunks to average over for speed check
) -> Path:
    """Download a polygon zip, resuming the partial file across retries.

    The target file is ``<folder>/<state.value>_<polygon.value>.zip``. Before
    every attempt the size of whatever is already on disk is measured and sent
    as a ``Range`` header, so a retry continues from the bytes actually
    written instead of from the offset seen when the function was first called.

    Args:
        self: Object providing ``_session`` (used via ``.stream(...)``) and
            ``_DOWNLOAD_BASE``.
        state: Value whose ``.value`` is sent as ``idEstado``.
        polygon: Value whose ``.value`` is sent as ``tipoBase``.
        captcha: Captcha text sent as ``ReCaptcha``.
        folder: Directory in which the zip file is written.
        chunk_size: Number of bytes requested per streamed chunk.
        max_retries: Maximum number of attempts that may fail with an
            ``httpx.RequestError`` before giving up.
        retry_delay: Seconds to sleep after each failed attempt.
        min_speed_threshold: Minimum acceptable average throughput, in bytes
            per second, over one measurement window.
        speed_check_interval: Number of chunks that make up one measurement
            window.

    Returns:
        Path of the downloaded zip file.

    Raises:
        UrlNotOkException: The response status is neither 200 nor 206.
        FailedToDownloadPolygonException: The response has no/zero
            ``Content-Length`` or is not ``application/zip``, or all retries
            were exhausted.
    """
    query = urlencode({"idEstado": state.value, "tipoBase": polygon.value, "ReCaptcha": captcha})
    url = f"{self._DOWNLOAD_BASE}?{query}"
    path = Path(os.path.join(folder, f"{state.value}_{polygon.value}")).with_suffix(".zip")

    retries = 0
    while retries < max_retries:
        # Re-measure on every attempt: a failed attempt has usually appended
        # data, so the offset from the previous attempt is stale.
        current_size = path.stat().st_size if path.exists() else 0
        headers = {"Range": f"bytes={current_size}-"} if current_size > 0 else {}
        # NOTE(review): if the file on disk is already complete the server will
        # presumably answer 416, which surfaces as UrlNotOkException - confirm.

        try:
            with self._session.stream("GET", url, headers=headers) as response:
                if response.status_code not in (httpx.codes.OK, httpx.codes.PARTIAL_CONTENT):
                    raise UrlNotOkException(url)

                content_length = int(response.headers.get("Content-Length", 0))
                content_type = response.headers.get("Content-Type", "")
                if content_length == 0 or not content_type.startswith("application/zip"):
                    raise FailedToDownloadPolygonException()

                # Append only when the server honoured the Range request (206).
                # A plain 200 carries the whole file, so appending it to the
                # partial data would corrupt the archive; start over instead.
                resuming = response.status_code == httpx.codes.PARTIAL_CONTENT
                if not resuming:
                    current_size = 0
                total_size = content_length + current_size
                mode = "ab" if resuming else "wb"

                with open(path, mode) as fd, tqdm(
                    initial=current_size,
                    total=total_size,
                    unit="iB",
                    unit_scale=True,
                    desc=f"Downloading polygon '{polygon.value}' for state '{state.value}'",
                ) as progress_bar:
                    # Throughput is measured per window of
                    # `speed_check_interval` chunks: bytes received in the
                    # window divided by the window's wall-clock duration.
                    window_start = time.monotonic()
                    window_bytes = 0
                    window_chunks = 0
                    for chunk in response.iter_bytes(chunk_size=chunk_size):
                        fd.write(chunk)
                        progress_bar.update(len(chunk))
                        window_bytes += len(chunk)
                        window_chunks += 1
                        if window_chunks < speed_check_interval:
                            continue

                        now = time.monotonic()
                        elapsed = now - window_start
                        # A zero-length window means the data arrived faster
                        # than the clock resolution; nothing to complain about.
                        if elapsed > 0:
                            avg_speed = window_bytes / elapsed
                            if avg_speed < min_speed_threshold:
                                # Raised as an httpx error so the handler
                                # below treats a stalled stream as retryable.
                                raise httpx.ReadTimeout(
                                    f"Average download speed too low: {avg_speed:.2f} bytes/sec"
                                )
                        window_start = now
                        window_bytes = 0
                        window_chunks = 0

            print("Download completed successfully.")
            return path  # Exit if download is complete
        except httpx.RequestError as e:  # httpx.ReadTimeout is a subclass
            retries += 1  # Increment retries before printing
            print(f"Retry {retries}/{max_retries} after error: {e}")
            time.sleep(retry_delay)

    # If we exit the loop, the download failed after all retries
    raise FailedToDownloadPolygonException("Max retries exceeded. Download failed.")
# Monkey-patch: bind the replacement function to the existing `car` instance so
# that it is used in place of that instance's `_download_polygon` method.
car._download_polygon = MethodType(custom_download_polygon, car)
Known issues with the code above:
Then, I used the following command to run and download a specific dataset:
# Download the APPS polygon for state RS into the given folder using the
# patched instance.
result = car.download_state(state=State.RS, polygon=Polygon.APPS, folder='drive/MyDrive/SICAR/', debug=True, chunk_size=1024, tries=200)
The `tries` argument is the library's original parameter for retrying the CAPTCHA.
Hope this helps!
Bom dia,
Estou tentando baixar múltiplas camadas do Cadastro Ambiental Rural (CAR) para o estado do Rio Grande do Sul (RS) usando Python 3.13.0 em MacOS 15.1 (24B83). No entanto, o processo falha e retorna a seguinte mensagem de erro:
Downloading polygon 'RESERVA_LEGAL' for state 'RS': 2%| | 13.2M/740M [00:22<21:03, 5 Traceback (most recent call last): File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/httpx/_transports/default.py", line 72, in map_httpcore_exceptions yield File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/httpx/_transports/default.py", line 116, in iter for part in self._httpcore_stream: ^^^^^^^^^^^^^^^^^^^^^ File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 367, in iter raise exc from None File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 363, in iter for part in self._stream: ^^^^^^^^^^^^ File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/httpcore/_sync/http11.py", line 349, in iter raise exc File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/httpcore/_sync/http11.py", line 341, in iter for chunk in self._connection._receive_response_body(**kwargs):