Closed jamieglohere closed 1 year ago
I think I did it:
Also consider using a generator object instead of creating a separate list (`lrl_placeholder`) ... if I can figure out how to make it work.
Even better, I was able to rewrite the function so that it doesn't have to rely on a second list or generator (reference).
#
# OpenStreetMap checker of LOCALE_1
# note that locale_1 collection process is different between single-locale and multi-locale links
#
def _osm_search_results(query):
    """Run one OpenStreetMap search and return its result links.

    ``query`` is the text placed after ``search?query=`` in the URL.
    Returns a list of the ``<a class="set_position">`` tags found inside the
    results list of the loaded page (empty when there are none).
    """
    driver.get(f'https://www.openstreetmap.org/search?query={query}')
    # humanizer fixes the problem of the script getting no OSM info sometimes
    # when you can see in the browser that there actually is
    humanizer(timer)
    osm_soup = BeautifulSoup(
        driver.page_source,
        'html.parser',
        parse_only=SoupStrainer("ul", class_="results-list list-group list-group-flush"),
    )
    return list(osm_soup.find_all("a", class_="set_position"))


def _user_confirms():
    """Prompt until the user types ``y`` or ``n``; return True for ``y``."""
    answer = input('>>> Type y if yes, n if no: ')
    # typo prevention
    while answer not in ('y', 'n'):
        answer = input('>>> Try again - Type y if yes, n if no: ')
    return answer == 'y'


def osm_check(locale_1, data):
    """Cross-check ``locale_1`` against OpenStreetMap and record the outcome.

    Searches OpenStreetMap for 'Salvador Allende' plus the locale and country,
    falling back to a more general 'Allende' search when the first one returns
    nothing. Each result is shown to the user for verification; the first one
    the user accepts is kept.

    Parameters:
        locale_1: the locale derived from the article.
        data: mapping whose ``'locale_1'`` entry supports ``append``. Exactly
            one value is appended per call: the third-from-last component of
            the accepted OSM address, or ``locale_1`` itself when no result
            was accepted or the accepted address is too short to contain one.

    Globals read: ``driver``, ``timer``, ``country_en`` (and the ``humanizer``
    helper). Globals written: ``osm_address`` and ``osm_info``.

    Returns None.
    """
    global osm_address, osm_info

    #
    # first, do a specific search for 'Salvador Allende';
    # if that has no results, try a more general search for 'Allende'
    #
    results = _osm_search_results(f'Salvador%20Allende%20{locale_1}%20{country_en}')
    if not results:
        results = _osm_search_results(f'Allende%20{locale_1}%20{country_en}')

    # clear the previous entry's osm_address and osm_info so that they don't
    # get copied into the current entry
    osm_address = ''
    osm_info = ''
    matched = False

    #
    # a single result looks like this - we can derive lots of info from here once user verifies that it looks good
    #
    # <a class="set_position" data-lat="-12.1102763" data-lon="-77.0104283"
    # data-min-lat="-12.1103037" data-max-lat="-12.1102452" data-min-lon="-77.0109212" data-max-lon="-77.0097999"
    # data-prefix="Residential Road" data-name="Salvador Allende, Villa Victoria, Surquillo, Province of Lima, Lima Metropolitan Area, Lima, 15000, Peru"
    # data-type="way" data-id="426845566" href="/way/426845566">Salvador Allende, Villa Victoria, Surquillo, Province of Lima, Lima Metropolitan Area, Lima, 15000, Peru</a>
    #
    if not results:
        print('No addresses found in OpenStreetMap. Will use the locale derived from the article...')
    else:
        print(f'{len(results)} possible address(es) found in OpenStreetMap.')
        for result in results:
            result = str(result)
            link_text = re.search(r'>\"*(.*)\"*<\/a>', result)
            if link_text is None:
                # no readable address to show, so the user cannot verify this one
                continue
            candidate = link_text.group(1)
            print(f'Please verify if this address matches the place in this article:\n{candidate}')
            if _user_confirms():
                # save the whole result for later parsing; we can stop looking
                osm_address = candidate
                osm_info = result
                matched = True
                break
        if not matched:
            if len(results) == 1:
                print('OpenStreetMap address does not match the place in this article. Will use the locale derived from the article...')
            else:
                print('All OpenStreetMap addresses do not match the place in this article. Will use the locale derived from the article...')

    if not matched:
        # nothing else we can do but add the default locale_1
        data['locale_1'].append(locale_1)
        print(f'Locale 1: {locale_1}')

    #
    # stay in the web page like a normal human would
    #
    humanizer(timer)

    #
    # when we have osm_info, we'll take locale details from its osm_address by splitting it.
    # sample split:
    # ['Salvador Allende', 'Villa Victoria', 'Surquillo', 'Province of Lima', 'Lima Metropolitan Area', 'Lima', '15000', 'Peru']
    # index 0 is the place's name, -1 is the country, -2 is the zip code, -3 is locale_1, etc...
    #
    # NOTE: the global is rebound to the split list in every case (``['']``
    # when nothing matched), mirroring what the previous implementation left
    # behind - TODO confirm whether later code relies on that.
    osm_address = osm_address.split(', ')
    if matched:
        if len(osm_address) >= 3:
            locale_1 = osm_address[-3]
        # else: the accepted address is too short to hold a locale component;
        # keep the article's locale so that data['locale_1'] still gets exactly
        # one entry instead of silently getting none.
        data['locale_1'].append(locale_1)
        print(f'Locale 1: {locale_1}')
Workflow:
- Search for `salvador allende {locale} {country}`, create `locale_results_list`, and append the search results there.
- Search for `allende {locale} {country}` and append the search results to the list.

Also consider using a generator object instead of creating a separate list (`lrl_placeholder`) ... if I can figure out how to make it work.

Use some Mexico links for testing.