amineboutarfi / google_maps_scraper

MIT License
76 stars 46 forks source link

Unable to display complete details #14

Closed anirudh-hegde closed 5 months ago

anirudh-hegde commented 5 months ago
# This script serves as an example of how to use Python
# & Playwright to scrape/extract data from Google Maps

from playwright.sync_api import sync_playwright
from dataclasses import dataclass, asdict, field
import pandas as pd
import argparse
import os
import sys

@dataclass
class Business:
    """Holds the data scraped for a single business listing.

    Every field defaults to None, so an empty ``Business()`` can be created
    first and then filled in attribute by attribute.
    """

    # Details read from the place panel of the selected listing.
    name: str = None
    address: str = None
    website: str = None
    phone_number: str = None
    # Rating information parsed from the listing's rating label.
    reviews_count: int = None
    reviews_average: float = None
    # Coordinates extracted from the page URL.
    latitude: float = None
    longitude: float = None
    # Opening hours are not collected at the moment.
    # hours: str = None
    # The search term that produced this listing.
    category: str = None

@dataclass
class BusinessList:
    """Holds a list of Business objects and saves them to Excel and CSV.

    Attributes:
        business_list: the collected Business records, in insertion order.
        save_at: directory the output files are written to. It has no type
            annotation, so it is a plain class attribute (not a dataclass
            field); override it on the class or on an instance.
    """
    # The element type is quoted so this class does not require Business to
    # be defined before it; the annotation is informational only.
    business_list: list["Business"] = field(default_factory=list)
    save_at = 'output'

    def _output_path(self, filename, extension):
        """Create the output directory if needed and build a file path.

        Args:
            filename (str): file name without extension
            extension (str): extension without the leading dot

        Returns: path of the file inside ``save_at``
        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.save_at, exist_ok=True)
        return os.path.join(self.save_at, f"{filename}.{extension}")

    def dataframe(self):
        """Transform business_list to pandas dataframe

        Returns: pandas dataframe
        """
        # json_normalize is documented for a dict or a list of dicts, so
        # hand it a concrete list rather than a generator.
        records = [asdict(business) for business in self.business_list]
        return pd.json_normalize(records, sep="_")

    def save_to_excel(self, filename):
        """Saves pandas dataframe to an excel (xlsx) file

        The file is written to ``<save_at>/<filename>.xlsx``.

        Args:
            filename (str): filename
        """
        self.dataframe().to_excel(self._output_path(filename, "xlsx"), index=False)

    def save_to_csv(self, filename):
        """Saves pandas dataframe to a csv file

        The file is written to ``<save_at>/<filename>.csv``.

        Args:
            filename (str): filename
        """
        self.dataframe().to_csv(self._output_path(filename, "csv"), index=False)

def extract_coordinates_from_url(url: str) -> tuple[float, float]:
    """Pull the latitude and longitude out of a Google Maps URL.

    The text after the last '/@' and before the next '/' is split on
    commas; the first two pieces are converted to floats.

    Returns: (latitude, longitude)
    """
    after_marker = url.split('/@')[-1]
    coordinate_segment = after_marker.split('/')[0]
    pieces = coordinate_segment.split(',')

    latitude = float(pieces[0])
    longitude = float(pieces[1])
    return latitude, longitude

# XPath matching the result-list anchors that link to individual place pages.
_LISTING_XPATH = '//a[contains(@href, "https://www.google.com/maps/place")]'


def _first_text(scope, xpath):
    """Return the inner text of the first element under scope matching xpath.

    Returns "" when nothing matches, so a missing detail becomes an empty
    cell instead of an exception.
    """
    matches = scope.locator(xpath)
    if matches.count() > 0:
        return matches.first.inner_text()
    return ""


def _parse_reviews(aria_label):
    """Parse (reviews_average, reviews_count) from a rating aria-label.

    The first whitespace-separated token is read as the average (a decimal
    comma is accepted) and the third token as the count (thousands commas
    are removed). A value that is missing or not numeric is returned as ""
    so one odd label does not discard the whole listing.
    """
    tokens = (aria_label or "").split()
    try:
        average = float(tokens[0].replace(",", "."))
    except (IndexError, ValueError):
        average = ""
    try:
        count = int(tokens[2].replace(",", ""))
    except (IndexError, ValueError):
        count = ""
    return average, count


def _collect_listings(page, total):
    """Scroll the results until enough listings are loaded, then return them.

    Scrolling stops once at least ``total`` listing anchors exist, or once a
    scroll adds no new anchors. Both exits return the PARENT element of each
    anchor, so that later lookups relative to a listing see the same
    element regardless of which exit was taken.
    """
    anchors_locator = page.locator(_LISTING_XPATH)

    # Number of anchors seen after the previous scroll; an unchanged count
    # means the end of the result list was reached.
    previously_counted = 0
    while True:
        page.mouse.wheel(0, 10000)
        page.wait_for_timeout(3000)

        # Query the count once per scroll instead of once per comparison.
        current_count = anchors_locator.count()

        if current_count >= total:
            anchors = anchors_locator.all()[:total]
            print(f"Total Scraped: {len(anchors)}")
            break

        if current_count == previously_counted:
            anchors = anchors_locator.all()
            print(f"Arrived at all available\nTotal Scraped: {len(anchors)}")
            break

        previously_counted = current_count
        print("Currently Scraped: ", current_count)

    return [anchor.locator("xpath=..") for anchor in anchors]


def _scrape_listing(page, listing, category):
    """Open one listing and return a Business filled from the page.

    Text details are read from the page after clicking the listing; the
    rating is read from the listing element itself; the coordinates come
    from the page URL. Details that cannot be found are stored as "".
    """
    listing.click()
    page.wait_for_timeout(5000)

    name_xpath = '//h1[@data-section-id="title"]'
    address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]'
    website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]'
    phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]'
    reviews_span_xpath = '//span[@role="img"]'

    business = Business()

    business.name = _first_text(page, name_xpath)
    if business.name:
        print(f"Extracted name: {business.name}")
    business.address = _first_text(page, address_xpath)
    business.website = _first_text(page, website_xpath)
    business.phone_number = _first_text(page, phone_number_xpath)

    rating_spans = listing.locator(reviews_span_xpath)
    if rating_spans.count() > 0:
        # Read the label once and parse both values from it.
        label = rating_spans.first.get_attribute("aria-label")
        business.reviews_average, business.reviews_count = _parse_reviews(label)
    else:
        business.reviews_average = ""
        business.reviews_count = ""

    try:
        business.latitude, business.longitude = extract_coordinates_from_url(page.url)
    except (ValueError, IndexError):
        # Keep the record; latitude/longitude stay at their None default.
        print(f"Could not read coordinates from URL: {page.url}")

    business.category = category  # Set the category

    return business


def main():
    """Scrape Google Maps listings for each search term and save them.

    Command-line arguments:
        -s / --search: a single search term. Without it the default
            category list (currently just "gym") is used.
        -t / --total: maximum number of listings to scrape per term.
            Without it (or with 0) a very large limit is used, i.e. all
            available listings are scraped.

    For every search term one xlsx and one csv file named
    ``google_maps_data_<term>`` (spaces replaced by underscores) is written
    through BusinessList.
    """
    # Read search from arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--search", type=str)
    parser.add_argument("-t", "--total", type=int)
    args = parser.parse_args()

    # If no search term is provided, use a default list of categories,
    # e.g. ["restaurant", "hotel", "pharmacy", "gym", "bank"].
    search_list = [args.search] if args.search else ["gym"]

    # If no total is passed, we set the value to a random big number
    total = args.total if args.total else 1_000_000

    # Scraping
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            page = browser.new_page()

            # Loop through specified categories
            for search_for_index, search_for in enumerate(search_list):
                print(f"-----\n{search_for_index} - {search_for}".strip())

                page.goto("https://www.google.com/maps", timeout=60000)
                # Wait is added for the dev phase. You can remove it in production
                page.wait_for_timeout(5000)

                page.locator('//input[@id="searchboxinput"]').fill(search_for)
                page.wait_for_timeout(3000)

                page.keyboard.press("Enter")
                page.wait_for_timeout(5000)

                # Hover a result so that the mouse wheel scrolls the result list
                page.hover(_LISTING_XPATH)

                listings = _collect_listings(page, total)

                business_list = BusinessList()

                for listing in listings:
                    # A failure on one listing must not stop the whole run.
                    try:
                        business = _scrape_listing(page, listing, search_for)
                    except Exception as e:
                        print(f'Error occurred: {e}')
                    else:
                        business_list.business_list.append(business)

                # Output
                output_name = f"google_maps_data_{search_for}".replace(' ', '_')
                business_list.save_to_excel(output_name)
                business_list.save_to_csv(output_name)
        finally:
            # Close the browser even if a search term fails midway.
            browser.close()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

[image] It's not displaying name, website, reviews_count, or reviews_average.

hosythach commented 5 months ago

hey bro @anirudh-hegde , You can try again with this code

anirudh-hegde commented 5 months ago

hey bro @anirudh-hegde , You can try again with this code

Ya