Open itsa-mee-mario opened 1 year ago
I think you might have an older version of the code. I don't see that line in the current person.py
Can you provide the code you've used please?
I think profile scraping just doesn't work right now since person.py hasn't been updated in two months. LinkedIn probably changed some of their HTML schema so hopefully a new commit is released soon.
Hey - I only updated two functions as I needed: get_experiences() and get_name_and_location(). In addition to UI updates I also fixed the scraper issue where it gets confused when a person has multiple positions at the same company over time.
You can selectively scrape by passing `scrape=False` and calling only the method you need: `person = Person("https://www.linkedin.com/in/sheanahamill", driver=driver, scrape=False)`, then `person.get_experiences()`, then `print(person.experiences)`.
def get_name_and_location(self):
    """Populate ``self.name`` and ``self.location`` from the profile's top card.

    Waits for the ``<main>`` element, collects the elements carrying the
    ``pv-text-details__left-panel`` class, then reads the name from the first
    child of the first panel and the location from the first ``<span>`` of
    the second panel.
    """
    panels = self.wait_for_element_to_load(
        by=By.TAG_NAME, name="main"
    ).find_elements(By.CLASS_NAME, "pv-text-details__left-panel")

    # First panel: the name is the text of its first direct child.
    name_children = panels[0].find_elements(By.XPATH, "*")
    self.name = name_children[0].text

    # Second panel: the location is the text of its first <span>.
    location_span = panels[1].find_element(By.TAG_NAME, "span")
    self.location = location_span.text
def get_experiences(self):
    """Scrape the experience details page and record every position found.

    Loads ``<linkedin_url>/details/experience``, walks each ``li`` of the
    ``pvs-list`` found under ``<main>`` and hands one ``Experience`` per
    position to ``self.add_experience``. An entry whose header has exactly
    two rows is treated as one company holding several nested positions.

    Fixes relative to the previous revision:
      * The URL is built by string joining; ``os.path.join`` uses the
        platform path separator and is not meant for URLs.
      * Per-entry fields are reset on every iteration instead of being
        probed with ``locals()``/``globals()``, so an unrecognised layout
        can neither raise ``UnboundLocalError`` nor reuse values left over
        from an earlier entry.
      * After a multi-position company the loop continues with the next
        entry instead of returning, which dropped all remaining entries.
      * The nested ``pvs-list`` is looked up once and with
        ``find_elements``, so a missing nested list falls back to the plain
        description path instead of raising.
      * Nested items pass their text, not the WebElement, as
        ``description``, matching the plain description path.

    Returns:
        None. Results are delivered through ``self.add_experience``.
    """

    def first_leaf_text(elem, depth=4):
        # The title (or company) text sits `depth` first-children below its row.
        for _ in range(depth):
            elem = elem.find_elements(By.XPATH, "*")[0]
        return elem.text

    def span_text(elem):
        # Text of the first <span> below `elem`.
        return elem.find_element(By.TAG_NAME, "span").text

    def child_or_none(children, index):
        # Safe positional access into a list of child elements.
        return children[index] if len(children) > index else None

    def inner_list(block):
        # Return the "pvs-list" nested inside the first "pvs-list" of
        # `block`, or None when `block` or either list is missing.
        if block is None:
            return None
        outer = block.find_elements(By.CLASS_NAME, "pvs-list")
        if not outer:
            return None
        inner = outer[0].find_elements(By.CLASS_NAME, "pvs-list")
        return inner[0] if inner else None

    def split_work_times(work_times):
        # Split "<from> - <to> · <duration>" into (from_date, to_date, duration).
        # from_date is the first two space-separated tokens, to_date is
        # everything after the third token (the separator).
        if not work_times:
            return "", "", None
        pieces = work_times.split("·")
        tokens = pieces[0].strip().split(" ")
        duration = pieces[1].strip() if len(pieces) > 1 else None
        return " ".join(tokens[:2]), " ".join(tokens[3:]), duration

    def add_entries(position_title, work_times, location, company,
                    company_url, summary_block):
        # Emit the Experience record(s) for one position. When the summary
        # block holds a nested list with more than one item, each item
        # becomes its own record; otherwise a single record is emitted with
        # the block's text as description.
        nested = inner_list(summary_block)
        items = nested.find_elements(By.XPATH, "li") if nested is not None else []

        if len(items) > 1:
            for item in items:
                parts = item.find_element(By.TAG_NAME, "a").find_elements(By.XPATH, "*")
                title_elem = child_or_none(parts, 0)
                times_elem = child_or_none(parts, 1)
                location_elem = child_or_none(parts, 2)

                item_location = (
                    location_elem.find_element(By.XPATH, "*").text
                    if location_elem is not None else None
                )
                item_title = (
                    title_elem.find_element(By.XPATH, "*").find_element(By.TAG_NAME, "*").text
                    if title_elem is not None else ""
                )
                item_times = (
                    times_elem.find_element(By.XPATH, "*").text
                    if times_elem is not None else ""
                )
                item_from, item_to, item_duration = split_work_times(item_times)

                self.add_experience(Experience(
                    position_title=item_title,
                    from_date=item_from,
                    to_date=item_to,
                    duration=item_duration,
                    location=item_location,
                    # Pass text, as the single-record path below does.
                    description=item.text,
                    institution_name=company,
                    linkedin_url=company_url,
                ))
            return

        from_date, to_date, duration = split_work_times(work_times)
        self.add_experience(Experience(
            position_title=position_title,
            from_date=from_date,
            to_date=to_date,
            duration=duration,
            location=location,
            description=summary_block.text if summary_block is not None else "",
            institution_name=company,
            linkedin_url=company_url,
        ))

    # URLs always use "/", independent of the host OS path separator.
    url = self.linkedin_url.rstrip("/") + "/details/experience"
    self.driver.get(url)
    self.focus()
    main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
    self.scroll_to_half()
    self.scroll_to_bottom()
    main_list = self.wait_for_element_to_load(name="pvs-list", base=main)

    for entry in main_list.find_elements(By.XPATH, "li"):
        entity = entry.find_element(By.CLASS_NAME, "pvs-entity")
        company_logo_elem, position_details = entity.find_elements(By.XPATH, "*")

        # company elem
        company_linkedin_url = company_logo_elem.find_element(By.XPATH, "*").get_attribute("href")

        # position details
        details = position_details.find_elements(By.XPATH, "*")
        summary_details = child_or_none(details, 0)
        summary_block = child_or_none(details, 1)  # skills OR list of positions
        if summary_details is None:
            # Nothing to parse for this entry.
            continue
        rows = summary_details.find_element(By.XPATH, "*").find_elements(By.XPATH, "*")

        # Fresh defaults for every entry so nothing leaks between iterations.
        position_title = ""
        company = "Not provided"
        work_times = None
        location = ""

        if len(rows) == 4:
            position_title = first_leaf_text(rows[0])
            company = span_text(rows[1])
            work_times = span_text(rows[2])
            location = span_text(rows[3])
        elif len(rows) == 3:
            if "·" in rows[2].text:
                # Third row carries the work times: title, company, times.
                position_title = first_leaf_text(rows[0])
                company = span_text(rows[1])
                work_times = span_text(rows[2])
            else:
                # No title row: company, times, location.
                company = first_leaf_text(rows[0])
                work_times = span_text(rows[1])
                location = span_text(rows[2])
        elif len(rows) == 2:
            # Person held multiple positions over time at one company.
            company = span_text(rows[0])
            print(colored(company, 'yellow'))

            nested = inner_list(summary_block)
            sub_entries = nested.find_elements(By.XPATH, "*") if nested is not None else []
            for sub_entry in sub_entries:
                print(colored('count position', "yellow"))
                sub_entity = sub_entry.find_element(By.CLASS_NAME, "pvs-entity")
                sub_details = sub_entity.find_elements(By.XPATH, "*")[1].find_elements(By.XPATH, "*")
                sub_summary = child_or_none(sub_details, 0)
                sub_block = child_or_none(sub_details, 1)  # skills OR list of positions
                sub_rows = (
                    sub_summary.find_element(By.XPATH, "*").find_elements(By.XPATH, "*")
                    if sub_summary is not None else []
                )

                sub_title, sub_times, sub_location = "", None, ""
                if len(sub_rows) == 3:
                    sub_title = first_leaf_text(sub_rows[0])
                    print(colored(sub_title, 'yellow'))
                    sub_times = span_text(sub_rows[1])
                    sub_location = span_text(sub_rows[2])
                else:
                    # Unrecognised layout: keep the blank defaults.
                    print('need fix.')

                # Nested positions are recorded without a company URL.
                add_entries(sub_title, sub_times, sub_location, company, "", sub_block)

            # Nested positions are already recorded; move on to the next entry.
            continue

        add_entries(position_title, work_times, location, company,
                    company_linkedin_url, summary_block)
This is from ~ a week ago, hopefully still working.
Hey, it got rid of the older error but now I'm getting the "need fix" line printed, any updates?
I encountered a similar issue when scraping another profile: "UnboundLocalError: cannot access the local variable 'work_times' when it is not assigned a value."
This error likely arises because the 'work_times' variable is accessed before it is given a value in all code paths. Depending on the logic, there might be cases where 'work_times' remains undefined. Out of curiosity, @alicemy478 and I are investigating the code to identify the error and any disparities with LinkedIn's HTML structure. It may be worth considering a switch to BeautifulSoup to rely on more consistent HTML parsing.
I'm facing the following error when scraping a profile:
and here is the traceback:
I have tried updating the position_name variable as mentioned in another issue.