mampfes / hacs_waste_collection_schedule

Home Assistant integration framework for (garbage collection) schedules
MIT License
1.03k stars 642 forks source link

MTL #1226

Closed antoinebou12 closed 9 months ago

antoinebou12 commented 1 year ago

https://montreal.ca/en/info-collections

5ila5 commented 1 year ago

Do you know of an easy way to get a collection calendar of some sort from the website? Parsing something like this (https://montreal.ca/en/info-collections/results?postalCode=H1S+1S5&civicNumber=4564) is a nightmare. You would need to extract dates from this text, which does not follow simple rules about where dates are positioned.

So the best way would probably be the static source. There you can set recurring events with exceptions and additions. It's a bit more work to set up, but I think parsing the response from the website is not a very good idea.

antoinebou12 commented 1 year ago

you can look at that too https://donnees.montreal.ca/dataset/info-collectes https://donnees.montreal.ca/dataset/info-collectes/resource/5f3fb372-64e8-45f2-a406-f1614930305c https://donnees.montreal.ca/dataset/info-collectes/resource/61e8c7e6-9bf1-45d9-8ebe-d7c0d50cfdbb https://donnees.montreal.ca/dataset/info-collectes/resource/06ec4987-47c9-4f05-a1ae-e164a96699c7 https://donnees.montreal.ca/dataset/info-collectes/resource/d0882022-c74d-4fe2-813d-1aa37f6427c9 https://donnees.montreal.ca/dataset/info-collectes/resource/2345d55a-5325-488c-b4fc-a885fae458e2

# NOTE(fix): the entire import section was duplicated verbatim in the original
# file; the duplicate has been removed and the imports grouped per PEP 8
# (stdlib / third-party / local).
import csv
import json
import logging
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Collection is expected to live in the same package.
from .collection import Collection

# Constants for Montreal
TITLE = "Montreal (QC)"
DESCRIPTION = "Source for Montreal waste collection"
URL = "https://donnees.montreal.ca/dataset/info-collectes"

# Open-data resource pages for the info-collectes dataset (one per resource).
DATA_SET_URLS = [
    "https://donnees.montreal.ca/dataset/info-collectes/resource/5f3fb372-64e8-45f2-a406-f1614930305c",
    "https://donnees.montreal.ca/dataset/info-collectes/resource/61e8c7e6-9bf1-45d9-8ebe-d7c0d50cfdbb",
    "https://donnees.montreal.ca/dataset/info-collectes/resource/06ec4987-47c9-4f05-a1ae-e164a96699c7",
    "https://donnees.montreal.ca/dataset/info-collectes/resource/d0882022-c74d-4fe2-813d-1aa37f6427c9",
    "https://donnees.montreal.ca/dataset/info-collectes/resource/2345d55a-5325-488c-b4fc-a885fae458e2",
]

# Montreal address-lookup API (used by fetch_address_info).
GEOGRAPHIC_API_URL = "https://api.montreal.ca/api/geographic/location/v1/addresses"

# CKAN API endpoint for Montreal waste collection
CKAN_API_URL = "https://donnees.montreal.ca/api/3/action/datastore_search"
RESOURCE_ID = "5f3fb372-64e8-45f2-a406-f1614930305c"  # Replace with actual resource ID

# Icons and pictures
ICON_MAP = {
    "Household waste": "mdi:trash-can",
    "Recycling": "mdi:recycle",
    "Construction debris": "mdi:hammer-screwdriver",
    "Green waste": "mdi:leaf",
    "Food waste": "mdi:food-apple",
    "Organic waste": "mdi:food-apple-outline",
}
PICTURE_MAP = {}

class MontrealCollection:
    """Fetch waste-collection schedule information for a Montreal address.

    Data can come from three places: the CKAN open-data API
    (``fetch_from_ckan`` / ``fetch_from_ckan_sql``), the geographic address
    API (``fetch_address_info``), or scraping the public info-collections
    results page (``fetch_from_html`` / ``parse_html``).
    """

    # Seconds before an outbound HTTP request is abandoned. Without a
    # timeout, `requests` can block forever on an unresponsive server.
    _REQUEST_TIMEOUT = 30

    def __init__(self, postal_code, civic_number):
        """Store the address and pre-build the results-page URL.

        :param postal_code: postal code of the address, e.g. "H3W 1W5"
        :param civic_number: street number of the address, e.g. "5550"
        """
        self._postal_code = postal_code
        self._civic_number = civic_number
        self._url = f"https://montreal.ca/en/info-collections/results?postalCode={postal_code}&civicNumber={civic_number}"
        # NOTE(review): configuring the root logger from library code is a
        # side effect callers may not want; kept for backward compatibility.
        logging.basicConfig(level=logging.INFO)

    def fetch_from_ckan(self):
        """Return the first few datastore records as parsed JSON, or None on error."""
        params = {
            "resource_id": RESOURCE_ID,
            "limit": 5  # Adjust as needed
        }
        try:
            response = requests.get(
                CKAN_API_URL, params=params, timeout=self._REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logging.error(f"Failed to get data from CKAN: {e}")
            return None

    def fetch_from_ckan_sql(self, sql_query):
        """Run *sql_query* against the CKAN datastore SQL endpoint.

        Bug fix: the original referenced an undefined ``CKAN_API_SQL_URL``
        and raised NameError on every call. CKAN serves SQL queries from the
        ``datastore_search_sql`` action, derived here from CKAN_API_URL.

        :param sql_query: a CKAN datastore SQL statement
        :return: parsed JSON response, or None on any request failure
        """
        params = {"sql": sql_query}
        try:
            response = requests.get(
                CKAN_API_URL + "_sql", params=params, timeout=self._REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logging.error(f"Failed to get data from CKAN SQL: {e}")
            return None

    def fetch_address_info(self, address_query):
        """Look up *address_query* via the Montreal geographic API; None on error."""
        params = {"q": address_query}
        try:
            response = requests.get(
                GEOGRAPHIC_API_URL, params=params, timeout=self._REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logging.error(f"Failed to get address info: {e}")
            return None

    def fetch_from_html(self):
        """Download the results page for this address and parse it.

        :return: dict from parse_html, or None if the request failed
        """
        try:
            response = requests.get(self._url, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            logging.error(f"Failed to get data: {e}")
            return None

        return self.parse_html(response.content)

    def parse_html(self, html_content):
        """Extract ``{category label: schedule details}`` from the results page.

        Returns None when the page cannot be parsed or the expected section is
        missing. Malformed list items are skipped instead of raising.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
        except Exception as e:
            logging.error(f"Failed to parse HTML: {e}")
            return None

        collection_schedule_section = soup.find('div', {'class': 'col-md-8 pl-lg-10'})
        if not collection_schedule_section:
            logging.warning("Relevant section not found. Make sure the class name is correct.")
            return None

        schedule_data = {}
        for schedule in collection_schedule_section.find_all('li', {'class': 'list-item-icon'}):
            # Robustness fix: the original called `.text` on these finds
            # without None checks, so one malformed <li> aborted the whole
            # parse with AttributeError.
            label = schedule.find('div', {'class': 'list-item-icon-label'})
            content = schedule.find('div', {'class': 'list-item-icon-content'})
            detail = content.find('div') if content else None
            if label is None or detail is None:
                continue
            schedule_data[label.text.strip()] = detail.text.strip()

        return schedule_data

    def to_csv(self, schedule_data, filename="montreal_collection_schedule.csv"):
        """Write *schedule_data* (category -> details) to *filename* as CSV."""
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(['Category', 'Details'])
            for category, details in schedule_data.items():
                csv_writer.writerow([category, details])

def fetch(self):
    # Choose which data source to fetch from
    entries = []  # Initialize an empty list to store Collection objects

    # Fetch from HTML page
    html_data = self.fetch_from_html()
    if html_data:
        # Process HTML data here
        for category, details in html_data.items():
            icon = ICON_MAP.get(category)
            # Create a Collection object and add it to the entries list
            entries.append(Collection(category, details, icon=icon))

        # Write to CSV
        self.to_csv(html_data)

    # Return entries or save them as needed
    return entries
# Manual smoke test: fetch the schedule for a sample Montreal address.
if __name__ == "__main__":
    demo_postal_code = "H3W 1W5"
    demo_civic_number = "5550"
    MontrealCollection(demo_postal_code, demo_civic_number).fetch()
antoinebou12 commented 9 months ago

https://github.com/mampfes/hacs_waste_collection_schedule/pull/1654 @julienboriasse