tomquirk / linkedin-api


Profile's interest scraping #345

Open · yukiandika opened this issue 9 months ago

yukiandika commented 9 months ago

Hi, I want to scrape a profile's interests (the companies that someone follows). I noticed that the get_profile function doesn't return anything about the profile's interests; we have to hit a different URL to get them. I wrote a function to solve this, but it still doesn't work.

    def get_interest(self, public_id=None, urn_id=None):
        """Fetch a profile's interests (e.g. followed companies)."""
        if urn_id:
            profile_urn = f"urn:li:fsd_profile:{urn_id}"
        else:
            profile = self.get_profile(public_id=public_id)
            profile_urn = profile["profile_urn"].replace("fs_miniProfile", "fsd_profile")

        res = self._fetch(
            f"/graphql?variables=(profileUrn:{profile_urn})"
            f"&&queryId=voyagerIdentityDashProfileCards"
            f".b0928897b71bd00a5a7291755dcd64f0")

        data = res.json()
        # Return an empty dict if the API reported an error
        if data and "status" in data and data["status"] != 200:
            self.logger.info("request failed: {}".format(data["message"]))
            return {}
        return data
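
For context, a minimal usage sketch, assuming the helper is monkey-patched onto the library's Linkedin class (the credentials and public_id below are placeholders):

    from linkedin_api import Linkedin

    # Attach the helper so it can use the client's _fetch/get_profile/logger
    Linkedin.get_interest = get_interest

    api = Linkedin("user@example.com", "password")  # placeholder credentials
    interests = api.get_interest(public_id="some-public-id")  # placeholder id
    print(interests)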

I think the problem might be the queryId, since it is dynamic. Does anyone have an idea about it?
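
One partial workaround sketch (not a fix for the rotation itself): accept the queryId as a parameter, so a fresh value captured from the browser's DevTools network tab can be passed in whenever LinkedIn rotates it. The default below is the possibly-stale value from the snippet above:

    def get_interest(self, public_id=None, urn_id=None,
                     query_id=("voyagerIdentityDashProfileCards"
                               ".b0928897b71bd00a5a7291755dcd64f0")):
        # Same request as above, but the rotating queryId is injectable
        if urn_id:
            profile_urn = f"urn:li:fsd_profile:{urn_id}"
        else:
            profile = self.get_profile(public_id=public_id)
            profile_urn = profile["profile_urn"].replace("fs_miniProfile", "fsd_profile")
        res = self._fetch(
            f"/graphql?variables=(profileUrn:{profile_urn})"
            f"&&queryId={query_id}")
        return res.json()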

ruffinellimarco commented 8 months ago

Curious to know if you ever managed to identify a workaround for this?

I was curious as well and scripted something with Selenium to achieve this.

    # Import necessary libraries
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import tkinter as tk
    from tkinter import simpledialog
    import time
    from datetime import datetime
    from bs4 import BeautifulSoup
    import re
    import undetected_chromedriver as uc
    import pandas as pd
    import json
    from pprint import pprint

    # Adjust pandas display settings
    pd.set_option('display.max_colwidth', None)

    # Initialize an empty list to store profiles' data
    profiles_data = []

    # Prompt the user to enter their LinkedIn login credentials.
    # (Body omitted in the original post; reconstructed here using the
    # tkinter/simpledialog imports above, as an assumption rather than
    # the author's exact code.)
    def get_credentials():
        root = tk.Tk()
        root.withdraw()  # hide the empty root window
        email = simpledialog.askstring("LinkedIn Login", "Email:")
        password = simpledialog.askstring("LinkedIn Login", "Password:", show="*")
        root.destroy()
        return email, password

    # Login to LinkedIn using Selenium.
    def login_to_linkedin(driver, email, password):
        driver.get('https://www.linkedin.com/login')
        driver.find_element(By.ID, 'username').send_keys(email)
        driver.find_element(By.ID, 'password').send_keys(password + Keys.RETURN)
        time.sleep(5)  # Awaiting login completion

    # Adjust the given LinkedIn URL to navigate to the specified section.
    def adjust_url_for_section(url, section):
        base_url = url.split('?')[0]
        return base_url.rstrip('/') + f"/details/{section}/"
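
    # For example (hypothetical URL):
    #   adjust_url_for_section("https://www.linkedin.com/in/janedoe?x=1", "interests")
    #   -> "https://www.linkedin.com/in/janedoe/details/interests/"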

    # Define the list of tabs to extract data from
    tabs = ["Top Voices", "Companies", "Groups", "Newsletters", "Schools"]

    def click_tab_and_extract_data(driver, tab_name):
        try:
            # Click on the tab
            tab_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, f"//button[span[contains(text(), '{tab_name}')]]"))
            )
            tab_element.click()

            # Wait for the content to load and extract names or entities
            names_elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".display-flex.align-items-center.mr1.hoverable-link-text.t-bold span[aria-hidden='true']"))
            )

            names = [element.text for element in names_elements if element.text.strip() != '']  # Filter out empty names/entities
            return names
        except Exception:
            print(f"Couldn't extract data from {tab_name} tab or tab not present.")
            return []
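
    # flatten_data is used in the loop below but was not included in the
    # original post; this minimal sketch (an assumption) joins each interest
    # tab's names into a single spreadsheet-friendly cell.
    def flatten_data(profile):
        flat = {}
        for key, value in profile.items():
            if isinstance(value, dict):  # e.g. the "Interests" sub-dict
                for sub_key, sub_value in value.items():
                    flat[f"{key} - {sub_key}"] = (
                        ", ".join(sub_value) if isinstance(sub_value, list) else sub_value
                    )
            else:
                flat[key] = value
        return flat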

    # Create an empty list to store all the flattened profiles
    flattened_profiles = []
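
    # Setup used below but omitted in the original post; a hedged sketch with
    # placeholder values (urls and output_file in particular are assumptions).
    driver = uc.Chrome()  # undetected_chromedriver browser session
    email, password = get_credentials()
    login_to_linkedin(driver, email, password)
    urls = []                      # fill with LinkedIn profile URLs to scrape
    interrupted = False            # flipped to True on KeyboardInterrupt
    scraped_timestamps = []        # completion timestamps, one per profile
    output_file = "profiles.xlsx"  # placeholder Excel output path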

    try:
        for url in urls:
            if interrupted:
                break  # Exit the loop if interrupted

            driver.get(url)
            time.sleep(3)

            # Dictionary to store the profile's data
            profile_data = {
                "LinkedIn URL": url  # Add the LinkedIn URL as an identifier
            }

            # ... (further profile fields omitted in the original post) ...

            # Extract interests
            interests_url = adjust_url_for_section(url, 'interests')
            driver.get(interests_url)
            time.sleep(3)

            interests_data = {}
            for tab_name in tabs:
                interests_data[tab_name] = click_tab_and_extract_data(driver, tab_name)
            profile_data['Interests'] = interests_data

            # After successfully scraping a profile, add its timestamp to the list
            scraped_timestamps.append(datetime.now())

            # Flatten the data and append to the list
            flattened_profiles.append(flatten_data(profile_data))

            profiles_data.append(profile_data)

            # Pretty print the profile data
            print("\nExtracted Data for Profile:")
            pprint(profile_data)
            print("\n" + "-"*50 + "\n")

    except KeyboardInterrupt:
        interrupted = True
        print("Script manually stopped. Data saved up to this point.")

    # After all URLs are processed, write all data to Excel file
    df_temp = pd.DataFrame(flattened_profiles)

    try:
        # Append the new data to the existing file if it is already there
        df_existing = pd.read_excel(output_file)
        df_final = pd.concat([df_existing, df_temp], ignore_index=True)
    except FileNotFoundError:
        df_final = df_temp  # first run: nothing to append to

    # Save the combined data
    df_final.to_excel(output_file, index=False)

    # Save the extracted data to a JSON file (path left as a placeholder)
    with open("xxxxxxxxxxxx", 'w', encoding='utf-8') as json_file:
        json.dump(profiles_data, json_file, ensure_ascii=False, indent=4)

    # Keep the script (and the browser window) alive until manually closed
    while True:
        time.sleep(10)