Closed joaquinmenendez closed 2 years ago
In line 202 of googlemaps.py,
I replaced the CSS selector with:
div.section-layout.section-scrollbox.mapsConsumerUiCommonScrollable__scrollable-y.mapsConsumerUiCommonScrollable__scrollable-show
and it worked.
Nevertheless, I don't know if this is a new change on their website or because I am on iOS. Feel free to check! If I have time I will try to test it.
I use Windows, and editing googlemaps.py as shown below makes it run:
class GoogleMapsScraper:
    """Scrape Google Maps place reviews with Selenium, parse them with BeautifulSoup.

    Usable as a context manager: the Chrome driver is torn down on exit.
    """

    def __init__(self, debug=False):
        # debug=False runs Chrome headless; debug=True opens a visible window.
        self.debug = debug
        self.driver = self.__get_driver()
        self.logger = self.__get_logger()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        # Print the traceback (if any), always tear the driver down, and
        # suppress the exception: returning True swallows it by design.
        if exc_type is not None:
            traceback.print_exception(exc_type, exc_value, tb)
        self.driver.close()
        self.driver.quit()
        return True

    def sort_by(self, url, ind):
        """Open *url* and select entry *ind* of the reviews 'Sort' dropdown.

        Returns 0 on success, -1 if the dropdown never became clickable
        after MAX_RETRY attempts.
        """
        self.driver.get(url)
        wait = WebDriverWait(self.driver, MAX_WAIT)

        # Open the dropdown menu, retrying up to MAX_RETRY times.
        clicked = False
        tries = 0
        while not clicked and tries < MAX_RETRY:
            try:
                menu_bt = wait.until(EC.element_to_be_clickable(
                    (By.XPATH, '//button[@data-value=\'Sort\']')))
                menu_bt.click()
                clicked = True
                time.sleep(3)
            except Exception:
                tries += 1
                # BUGFIX: Logger.warn is a deprecated alias of Logger.warning.
                self.logger.warning('Failed to click recent button')

        # Failed to open the dropdown.
        if tries == MAX_RETRY:
            return -1

        # Element of the list specified according to ind.
        recent_rating_bt = self.driver.find_elements_by_xpath(
            '//li[@role=\'menuitemradio\']')[ind]
        recent_rating_bt.click()

        # Wait for the reviews to load (ajax call).
        time.sleep(5)
        return 0

    def get_reviews(self, offset):
        """Scroll, expand and parse the reviews, returning those after *offset*."""
        # Wait for other reviews to load (ajax), then scroll to trigger lazy loading.
        time.sleep(4)
        self.__scroll()

        # Expand truncated review text.
        self.__expand_reviews()

        # Parse reviews from the rendered page source.
        response = BeautifulSoup(self.driver.page_source, 'html.parser')
        rblock = response.find_all(
            'div',
            class_='mapsConsumerUiSubviewSectionReview__section-review-content')
        parsed_reviews = []
        for index, review in enumerate(rblock):
            if index >= offset:
                # BUGFIX: parse each review once; the original parsed twice
                # (once to append, once to print), doing double work and
                # producing two dicts with different retrieval_date values.
                parsed = self.__parse(review)
                parsed_reviews.append(parsed)
                print(parsed)
        return parsed_reviews

    def get_account(self, url):
        """Load *url* and return the place-level metadata (rating, review count)."""
        self.driver.get(url)
        # Ajax call also for this section.
        time.sleep(4)
        resp = BeautifulSoup(self.driver.page_source, 'html.parser')
        place_data = self.__parse_place(resp)
        return place_data

    def __parse(self, review):
        """Extract one review's fields from its BeautifulSoup element into a dict."""
        item = {}

        try:
            review_text = self.__filter_string(
                review.find('span', class_='section-review-text').text)
        except Exception:
            # Review with no text body.
            review_text = None

        rating = float(review.find(
            'span', class_='section-review-stars')['aria-label'].split(' ')[1])
        relative_date = review.find(
            'span', class_='section-review-publish-date').text

        try:
            n_reviews_photos = review.find(
                'div', class_='section-review-subtitle').find_all('span')[1].text
            # NOTE(review): this literal is the raw UTF-8 byte sequence of the
            # Katakana middle-dot separator; on Python 3 the decoded page text
            # would contain '\u30fb' instead, so this split may never match —
            # TODO confirm against a live page before changing it.
            metadata = n_reviews_photos.split('\xe3\x83\xbb')
            if len(metadata) == 3:
                n_photos = int(metadata[2].split(' ')[0].replace('.', ''))
            else:
                n_photos = 0
            idx = len(metadata)
            n_reviews = int(metadata[idx - 1].split(' ')[0].replace('.', ''))
        except Exception:
            n_reviews = 0
            n_photos = 0

        user_url = review.find('a')['href']

        item['caption'] = review_text
        # Depends on language, which depends on geolocation defined by Google
        # Maps; a custom mapping to transform into a date should be implemented.
        item['relative_date'] = relative_date
        # Store datetime of scraping; the real date can be derived later as
        # retrieval_date - time(relative_date).
        item['retrieval_date'] = datetime.now()
        item['rating'] = rating
        item['n_review_user'] = n_reviews
        item['n_photo_user'] = n_photos
        item['url_user'] = user_url
        return item

    def __parse_place(self, response):
        """Extract a place's overall rating and total review count from the page."""
        place = {}
        # BUGFIX: bare except clauses narrowed to Exception so that
        # KeyboardInterrupt/SystemExit are not silently swallowed.
        try:
            place['overall_rating'] = float(
                response.find('div', class_='gm2-display-2').text.replace(',', '.'))
        except Exception:
            place['overall_rating'] = 'NOT FOUND'
        try:
            place['n_reviews'] = int(
                response.find('div', class_='gm2-caption')
                .text.replace('.', '').replace(',', '').split(' ')[0])
        except Exception:
            place['n_reviews'] = 0
        return place

    # Expand review description.
    def __expand_reviews(self):
        # Use XPath to click every "expand review" button so the full text loads.
        links = self.driver.find_elements_by_xpath(
            '//button[@class=\'mapsConsumerUiSubviewSectionReview__section-expand-review mapsConsumerUiCommonButton__blue-link\']')
        for link in links:
            link.click()
        time.sleep(2)

    # Load more reviews.
    def more_reviews(self):
        # Click the "N reviews" button that opens the full reviews pane.
        links = self.driver.find_elements_by_xpath(
            '//button[@jsaction=\'pane.reviewChart.moreReviews\']')
        print('LINKS HERE', links)
        for link in links:
            link.click()
        time.sleep(2)

    def __scroll(self):
        # Scroll the reviews pane to its bottom to trigger lazy loading of
        # the next batch of reviews.
        scrollable_div = self.driver.find_element_by_css_selector(
            'div.section-layout.section-scrollbox.mapsConsumerUiCommonScrollable__scrollable-y.mapsConsumerUiCommonScrollable__scrollable-show')
        self.driver.execute_script(
            'arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)

    def __get_logger(self):
        """Create a DEBUG-level logger that writes to gm-scraper.log."""
        logger = logging.getLogger('googlemaps-scraper')
        logger.setLevel(logging.DEBUG)
        # File handler at DEBUG level with a timestamped format.
        fh = logging.FileHandler('gm-scraper.log')
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)
        return logger

    def __get_driver(self, debug=False):
        # NOTE(review): the debug parameter is unused — headless mode is
        # decided by self.debug set in __init__. Kept for backward
        # compatibility; consider removing it.
        options = Options()
        if not self.debug:
            options.add_argument("--headless")
        else:
            options.add_argument("--window-size=1366,768")
        options.add_argument("--disable-notifications")
        options.add_argument("--lang=en-GB")
        # NOTE(review): chrome_options= is deprecated (removed in Selenium 4,
        # where options= is required); left as-is because the class also uses
        # Selenium-3-only find_element(s)_by_* APIs.
        input_driver = webdriver.Chrome(chrome_options=options)
        return input_driver

    # Util function to clean special characters.
    def __filter_string(self, text):
        # BUGFIX: parameter renamed from 'str', which shadowed the builtin.
        # Collapse CR/LF/TAB to single spaces so each review stays on one line.
        return text.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')
@ke511081177 Do you happen to have a working version of this tool? I have been trying to get it to work, but nothing has worked so far :(
Hi,
latest merge and commits to master solved the problem, now the tool should work as expected. Please let me know if you discover any other weird behaviours and send pull requests if you want to contribute.
Thank you, Mattia
I am having the following issue when running the example command:
"selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.section-layout.section-scrollbox.scrollable-y.scrollable-show"} (Session info: headless chrome=90.0.4430.93)"
I tried this URL = "https://www.google.com/maps/place/Nike+Alto+Palermo/@-34.5883936,-58.4098625,15z/data=!4m7!3m6!1s0x0:0x842bcbef147891ee!8m2!3d-34.588376!4d-58.4098447!9m1!1b1"
But the same happens when I try to use the URLs at
urls.txt
I am using a Mac and installed chromedriver using Homebrew.