amineboutarfi / google_maps_scraper

MIT License
79 stars 50 forks source link

Mis modificaciones al código. #2

Closed Arlexg closed 1 year ago

Arlexg commented 1 year ago

"""This script serves as an example on how to use Python & Playwright to scrape/extract data from Google Maps"""

import argparse
from dataclasses import dataclass, asdict, field
from typing import List

import pandas as pd
from playwright.sync_api import sync_playwright

@dataclass
class Business:
    """Holds the data scraped for a single Google Maps business listing.

    Every field defaults to None and is filled in (or set to '') by the
    scraper; `horario` holds the weekly opening hours, one entry per row of
    the hours table.
    """

    name: str = None
    image: str = None  # URL of the main listing photo
    category: str = None
    address: str = None
    website: str = None
    phone_number: str = None
    reviews_count: int = None
    reviews_average: float = None
    horario: List[str] = None

@dataclass
class BusinessList:
    """Holds a list of Business objects and saves them to Excel and CSV."""

    # Items are Business instances appended by the scraper.
    business_list: list = field(default_factory=list)

    def dataframe(self):
        """Transform business_list into a flattened pandas DataFrame.

        Returns:
            pandas.DataFrame: one row per Business; nested keys are joined
            with "_" by json_normalize.
        """
        return pd.json_normalize(
            (asdict(business) for business in self.business_list), sep="_"
        )

    def save_to_excel(self, filename):
        """Save the dataframe to an Excel (xlsx) file.

        Args:
            filename (str): output filename, without extension
        """
        # Bug fix: the original ignored `filename` and wrote a hard-coded name.
        self.dataframe().to_excel(f"{filename}.xlsx", index=False)

    def save_to_csv(self, filename):
        """Save the dataframe to a csv file.

        Args:
            filename (str): output filename, without extension
        """
        # Bug fix: the original ignored `filename` and wrote a hard-coded name.
        self.dataframe().to_csv(f"{filename}.csv", index=False)

def _text_or_blank(scope, xpath):
    """Return the inner text of the first match of *xpath* under *scope*,
    or '' when nothing matches."""
    locator = scope.locator(xpath)
    return locator.inner_text() if locator.count() > 0 else ''


def main():
    """Scrape Google Maps for `search_for`, collecting up to `total` listings.

    Depends on the module-level globals `search_for` (query string) and
    `total` (max listing count) set in the __main__ block. Saves the results
    to both Excel and CSV.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        page = browser.new_page()
        page.set_viewport_size({"width": 2020, "height": 2180})

        page.goto('https://www.google.us/maps', timeout=60000)

        # Wait until the search box exists so the page is ready for input.
        page.wait_for_selector('#searchboxinput')

        # wait is added for dev phase. can remove it in production
        page.wait_for_timeout(5000)

        page.locator('//input[@id="searchboxinput"]').fill(search_for)
        page.wait_for_timeout(3000)

        page.keyboard.press('Enter')
        page.wait_for_timeout(5000)

        # Scroll the results feed until we have `total` listings or the feed
        # stops growing.
        page.hover('(//div[@role="feed"])[1]')
        page.set_viewport_size({"width": 2020, "height": 2180})
        # Listing count seen in the previous iteration; used to detect that
        # the feed stopped growing (all available listings reached).
        previously_counted = 0
        while True:
            page.mouse.wheel(0, 10000)
            page.wait_for_timeout(3000)

            current_count = page.locator('//div[@role="article"]').count()
            # Bug fix: the original compared the count of the *feed*
            # container (always 1) against `total`, so this branch could
            # never trigger; count the article cards instead.
            if current_count >= total:
                listings = page.locator('//div[@role="article"]').all()[:total]
                print(f'Total Scraped: {len(listings)}')
                break
            if current_count == previously_counted:
                # Feed stopped growing: arrived at all available listings,
                # break so the loop does not run infinitely.
                listings = page.locator('//div[@role="article"]').all()
                print(f'Arrived at all available\nTotal Scraped: {len(listings)}')
                break
            previously_counted = current_count
            print('Currently Scraped: ', current_count)

        business_list = BusinessList()

        # XPaths for the detail pane fields, hoisted out of the loop.
        # NOTE(review): the aria-label based XPaths ("Mostrar el horario de
        # la semana", "Reseñas", "Información") are Spanish-locale specific
        # and will not match when the browser uses another language.
        name_xpath = '//h1[contains(@class, "fontHeadlineLarge")]'
        image_xpath = '//div[contains(@class, "ZKCDEc")]/div/button/img'
        category_xpath = '//button[contains(@class, "DkEaL") and contains(@jsaction, "pane.rating.category")]'
        address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]'
        website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]'
        phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]'
        reviews_span_xpath = '//span[@role="img"]'
        horario_button_xpath = '//span[@aria-label="Mostrar el horario de la semana"]'
        horario_xpath = '//table[contains(@class, "fontBodyMedium")]//tr'
        resenas_button_xpath = '//button[contains(@aria-label, "Reseñas")]'
        informacion_button_xpath = '//button[contains(@aria-label, "Información")]'

        # scraping: open each listing's detail pane and extract its fields
        for listing in listings:
            listing.click()
            page.wait_for_timeout(2000)

            business = Business()

            business.name = _text_or_blank(page, name_xpath)
            if page.locator(image_xpath).count() > 0:
                business.image = page.locator(image_xpath).nth(0).get_attribute('src')
            else:
                business.image = ''
            business.category = _text_or_blank(page, category_xpath)
            business.address = _text_or_blank(page, address_xpath)
            business.website = _text_or_blank(page, website_xpath)
            business.phone_number = _text_or_blank(page, phone_number_xpath)

            if listing.locator(reviews_span_xpath).count() > 0:
                # aria-label presumably reads like "4,5 estrellas 123
                # Reseñas" (locale dependent) — TODO confirm the token
                # positions hold for the target locale.
                label = listing.locator(reviews_span_xpath).get_attribute('aria-label')
                tokens = label.split()
                business.reviews_average = float(tokens[0].replace(',', '.').strip())
                business.reviews_count = float(tokens[2].strip())
            else:
                business.reviews_average = ''
                business.reviews_count = ''

            try:
                # Best-effort: expand the weekly-hours table if the toggle
                # is present; ignore failures (element may detach mid-click).
                if page.locator(horario_button_xpath).count() > 0:
                    page.locator(horario_button_xpath).click()
                    page.wait_for_timeout(1000)
            except Exception:
                pass

            business.horario = [
                row.inner_text() for row in page.locator(horario_xpath).all()
            ]

            try:
                # The original clicked the "Reseñas" tab in two separate
                # blocks; one click is enough to open the reviews tab.
                if page.locator(resenas_button_xpath).count() > 0:
                    page.locator(resenas_button_xpath).click()
                    page.wait_for_timeout(10000)

                    # Extract and print each visible review.
                    reviews_xpath = '//div[@class="review-container"]'
                    for review in page.locator(reviews_xpath).all():
                        rating = review.locator('.//span[@class="section-star-display"]').inner_text()
                        author_name = review.locator('.//div[@class="section-review-title"]/span[contains(@class, "section-review-title")]/span[contains(@class, "section-review-title")]/span').inner_text()
                        author_photo = review.locator('.//div[contains(@class, "section-review-title")]/div/img').get_attribute('src')
                        comment = review.locator('.//div[@class="section-review-review-content"]').inner_text()

                        print("Rating:", rating)
                        print("Author Name:", author_name)
                        print("Author Photo:", author_photo)
                        print("Comment:", comment)
                        print("\n")
            except Exception:
                pass

            try:
                # Best-effort: switch to the "Información" tab.
                if page.locator(informacion_button_xpath).count() > 0:
                    page.locator(informacion_button_xpath).click()
                    page.wait_for_timeout(3000)
            except Exception:
                pass

            business_list.business_list.append(business)

        # saving to both excel and csv just to showcase the features.
        business_list.save_to_excel('google_maps_data')
        business_list.save_to_csv('google_maps_data')

        browser.close()

# Bug fix: the original read `if name == "main":` — the dunder underscores
# were stripped by markdown; restore the standard script-entry guard.
if __name__ == "__main__":
    # CLI: -s/--search query string, -t/--total max listings to scrape.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--search", type=str)
    parser.add_argument("-t", "--total", type=int)
    args = parser.parse_args()

    if args.search:
        search_for = args.search
    else:
        # in case no arguments passed
        # the scraper will search by default for:
        search_for = 'clinicas veterinarias, new york, united state'

    # total number of products to scrape. Default is 10
    if args.total:
        total = args.total
    else:
        total = 10

    main()
amineboutarfi commented 1 year ago

Thanks! Those features are useful, will integrate them later.

Advice: try not to use @aria-label in XPaths, because Google Maps renders text in the browser's language. In your case it's Spanish (example: "Mostrar el horario de la semana"). If another user's browser is set to a different language, those XPaths won't match!