pelias / docker

Run the Pelias geocoder in docker containers, including example projects.

North America openaddresses missing data and incomplete #338

Open coldicefisher opened 6 months ago

coldicefisher commented 6 months ago

Use-cases

The openaddresses links are incomplete/out of sync. There is also no way of fetching only the differences.

Attempted Solutions

I built two Python scripts to solve my issue. The first script builds a hash of all the us, ca, and mx files and updates pelias.json with them. The second script builds an index of the DATA_DIR, reconciles it against the links on openaddresses.io, and downloads any missing files.

Proposal

I am still working with this code, but I can dockerize it and add a pelias command if that's useful. Even just having an updated pelias.json may be helpful.
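
If dockerizing it is worthwhile, a minimal Dockerfile sketch for the two scripts could look like this (the script filenames are placeholders; the only dependencies are requests, beautifulsoup4, and python-dotenv):

FROM python:3.11-slim
WORKDIR /app
# Placeholder filenames for the two scripts below
COPY build_pelias_json.py reconcile_openaddresses.py ./
RUN pip install requests beautifulsoup4 python-dotenv
# Reconcile/download by default; override the command to rebuild pelias.json
CMD ["python", "reconcile_openaddresses.py"]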

Updated pelias.json:

pelias.json
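
For context, the section the script rewrites is imports.openaddresses.files; the generated file should have roughly this shape (the entries here are illustrative, not taken from my output):

{
  "imports": {
    "openaddresses": {
      "datapath": "/data/openaddresses",
      "files": [
        "us/ny/city_of_new_york.csv",
        "ca/on/province_of_ontario.csv",
        "mx/jal/countrywide.csv"
      ]
    }
  }
}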

Script to build a new pelias.json:

import requests
from bs4 import BeautifulSoup
import json

url = "https://results.openaddresses.io"

response = requests.get(url)
response.raise_for_status()  # fail fast if the index page can't be fetched
html_content = response.text

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find the table with id "runs"
runs_table = soup.find('table', {'id': 'runs'})

# Extract data from the "processed" column in the "runs" table
files = []
# Select only the desired countries: us, mx, ca
countries = ["us", "mx", "ca"]

# Find all rows in the "runs" table
rows = runs_table.find_all('tr')

for row in rows:
    # Find the "processed" column in each row
    processed_column = row.find('td', {'class': 'processed'})

    if processed_column:
        # Find the link in the "processed" column
        link = processed_column.find('a')

        if link:
            # Assumes processed links look like
            # .../runs/<run_id>/<country>/<state>/<file>.zip,
            # so path segments 5 and 6 are the country and state codes
            country_code = link['href'].split('/')[5]
            state_code = link['href'].split('/')[6]

            if country_code in countries:
                # Extract the filename and add it to the files list
                filename = link['href'].split('/')[-1].replace('.zip', '.csv')
                files.append(f"{country_code}/{state_code}/{filename}")

# Read the existing pelias.json file
with open('pelias.json', 'r') as pelias_file:
    pelias_data = json.load(pelias_file)

# Modify the imports section with the new data
pelias_data["imports"]["openaddresses"]["files"] = files

# Write the updated content to a new file
with open('pelias.scraped.json', 'w') as new_pelias_file:
    json.dump(pelias_data, new_pelias_file, indent=2)

print("pelias.scraped.json has been updated.")

Script to reconcile the current openaddresses files against the site:

import os
import requests
import zipfile
import json
from dotenv import load_dotenv
from datetime import datetime
from bs4 import BeautifulSoup

# Load environment variables from .env file
load_dotenv()

countries = ["us", "mx", "ca"]

# Define the data directory from the environment variable
data_dir = os.getenv("DATA_DIR")
if not data_dir:
    raise SystemExit("DATA_DIR is not set; define it in .env or the environment")

# Read the pelias.json file
with open('pelias.json', 'r') as pelias_file:
    pelias_data = json.load(pelias_file)

# Create a dictionary to store filename and corresponding link
filename_link_dict = {}

# Fetch links from the webpage
url = "https://results.openaddresses.io"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
runs_table = soup.find('table', {'id': 'runs'})

# Iterate through the rows of the runs table and extract links
for row in runs_table.find_all('tr'):
    processed_column = row.find('td', {'class': 'processed'})

    if processed_column:
        link = processed_column.find('a')

        if link:
            country, state, filename = link['href'].split('/')[-3:]
            if country not in countries:
                continue

            filename_link_dict[(country, state, filename.replace('.zip', '.csv'))] = link['href']

# Ensure the data directory and openaddresses subdirectory exist
openaddresses_dir = os.path.join(data_dir, 'openaddresses')
os.makedirs(openaddresses_dir, exist_ok=True)

# Log file path
log_file_path = 'openaddresses.custom_parser.log'

# Function to log events
def log_event(event_type, filename):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_message = f"{timestamp} {event_type}: {filename}."
    with open(log_file_path, 'a') as log_file:
        log_file.write(log_message + '\n')
    print(log_message)

for file_tuple, file_url in filename_link_dict.items():
    country, state, filename = file_tuple
    print("Creating:", openaddresses_dir, country, state, filename)
    file_path = os.path.join(openaddresses_dir, country, state, filename)
    dir_path = os.path.join(openaddresses_dir, country, state)
    # Create the directory if it does not exist
    os.makedirs(dir_path, exist_ok=True)

    # Check if the file already exists
    if os.path.exists(file_path):
        log_event("file_exists", filename)
    else:
        log_event("file_not_found", filename)

        # Download the file; request failures are logged rather than fatal
        try:
            response = requests.get(file_url)
            response.raise_for_status()
            with open(file_path + '.zip', 'wb') as zip_file:
                zip_file.write(response.content)

            # Extract into the openaddresses dir (zip entries are expected
            # to carry their country/state path)
            with zipfile.ZipFile(file_path + '.zip', 'r') as zip_ref:
                zip_ref.extractall(openaddresses_dir)
        except Exception as e:
            log_event("download_error", filename)
            print(e)
            continue

        # Remove the downloaded zip file
        os.remove(file_path + '.zip')

print("Script execution complete.")

References

No.