tomquirk / linkedin-api


Profile's interest scraping #345

Open · yukiandika opened this issue 9 months ago

yukiandika commented 9 months ago

Hi, I want to scrape a profile's interests (the companies that someone follows). I noticed that the get_profile function doesn't return anything about the profile's interests; we have to hit a different URL to get them. I wrote a function to solve this, but it still doesn't work.

    def get_interest(self, public_id=None, urn_id=None):
        """Fetch a profile's interests (e.g. followed companies)."""
        if urn_id:
            profile_urn = f"urn:li:fsd_profile:{urn_id}"
        else:
            profile = self.get_profile(public_id=public_id)
            profile_urn = profile["profile_urn"].replace("fs_miniProfile", "fsd_profile")

        res = self._fetch(
            f"/graphql?variables=(profileUrn:{profile_urn})"
            f"&&queryId=voyagerIdentityDashProfileCards"
            f".b0928897b71bd00a5a7291755dcd64f0")

        data = res.json()
        # Return an empty dict if the API reported an error
        if data and "status" in data and data["status"] != 200:
            self.logger.info("request failed: {}".format(data["message"]))
            return {}
        return data
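
For context, a minimal usage sketch, assuming the helper is monkey-patched onto the library's Linkedin class (the credentials and public_id below are placeholders):

    from linkedin_api import Linkedin

    # Attach the helper so it can use the client's _fetch/get_profile/logger
    Linkedin.get_interest = get_interest

    api = Linkedin("user@example.com", "password")  # placeholder credentials
    interests = api.get_interest(public_id="some-public-id")  # placeholder id
    print(interests)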

I think the problem might be the queryId, since it is dynamic. Does anyone have an idea about it?
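
One partial workaround sketch (not a fix for the rotation itself): accept the queryId as a parameter, so a fresh value captured from the browser's DevTools network tab can be passed in whenever LinkedIn rotates it. The default below is the possibly-stale value from the snippet above:

    def get_interest(self, public_id=None, urn_id=None,
                     query_id=("voyagerIdentityDashProfileCards"
                               ".b0928897b71bd00a5a7291755dcd64f0")):
        # Same request as above, but the rotating queryId is injectable
        if urn_id:
            profile_urn = f"urn:li:fsd_profile:{urn_id}"
        else:
            profile = self.get_profile(public_id=public_id)
            profile_urn = profile["profile_urn"].replace("fs_miniProfile", "fsd_profile")
        res = self._fetch(
            f"/graphql?variables=(profileUrn:{profile_urn})"
            f"&&queryId={query_id}")
        return res.json()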

ruffinellimarco commented 8 months ago

Curious to know if you ever managed to identify a workaround for this?

I was curious as well and scripted something with Selenium to achieve this.

    # Import necessary libraries
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import tkinter as tk
    from tkinter import simpledialog
    import time
    from datetime import datetime
    from bs4 import BeautifulSoup
    import re
    import undetected_chromedriver as uc
    import pandas as pd
    import json
    from pprint import pprint

    # Adjust pandas display settings
    pd.set_option('display.max_colwidth', None)

    # Initialize an empty list to store profiles' data
    profiles_data = []

    # Prompt the user to enter their LinkedIn login credentials.
    # (Body omitted in the original post; reconstructed here using the
    # tkinter/simpledialog imports above, as an assumption rather than
    # the author's exact code.)
    def get_credentials():
        root = tk.Tk()
        root.withdraw()  # hide the empty root window
        email = simpledialog.askstring("LinkedIn Login", "Email:")
        password = simpledialog.askstring("LinkedIn Login", "Password:", show="*")
        root.destroy()
        return email, password

    # Login to LinkedIn using Selenium.
    def login_to_linkedin(driver, email, password):
        driver.get('https://www.linkedin.com/login')
        driver.find_element(By.ID, 'username').send_keys(email)
        driver.find_element(By.ID, 'password').send_keys(password + Keys.RETURN)
        time.sleep(5)  # Awaiting login completion

    # Adjust the given LinkedIn URL to navigate to the specified section.
    def adjust_url_for_section(url, section):
        base_url = url.split('?')[0]
        return base_url.rstrip('/') + f"/details/{section}/"
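
    # For example (hypothetical URL):
    #   adjust_url_for_section("https://www.linkedin.com/in/janedoe?x=1", "interests")
    #   -> "https://www.linkedin.com/in/janedoe/details/interests/"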

    # Define the list of tabs to extract data from
    tabs = ["Top Voices", "Companies", "Groups", "Newsletters", "Schools"]

    def click_tab_and_extract_data(driver, tab_name):
        try:
            # Click on the tab
            tab_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, f"//button[span[contains(text(), '{tab_name}')]]"))
            )
            tab_element.click()

            # Wait for the content to load and extract names or entities
            names_elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".display-flex.align-items-center.mr1.hoverable-link-text.t-bold span[aria-hidden='true']"))
            )

            names = [element.text for element in names_elements if element.text.strip() != '']  # Filter out empty names/entities
            return names
        except Exception:
            print(f"Couldn't extract data from {tab_name} tab or tab not present.")
            return []
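
    # flatten_data is used in the loop below but was not included in the
    # original post; this minimal sketch (an assumption) joins each interest
    # tab's names into a single spreadsheet-friendly cell.
    def flatten_data(profile):
        flat = {}
        for key, value in profile.items():
            if isinstance(value, dict):  # e.g. the "Interests" sub-dict
                for sub_key, sub_value in value.items():
                    flat[f"{key} - {sub_key}"] = (
                        ", ".join(sub_value) if isinstance(sub_value, list) else sub_value
                    )
            else:
                flat[key] = value
        return flat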

    # Create an empty list to store all the flattened profiles
    flattened_profiles = []
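
    # Setup used below but omitted in the original post; a hedged sketch with
    # placeholder values (urls and output_file in particular are assumptions).
    driver = uc.Chrome()  # undetected_chromedriver browser session
    email, password = get_credentials()
    login_to_linkedin(driver, email, password)
    urls = []                      # fill with LinkedIn profile URLs to scrape
    interrupted = False            # flipped to True on KeyboardInterrupt
    scraped_timestamps = []        # completion timestamps, one per profile
    output_file = "profiles.xlsx"  # placeholder Excel output path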

    try:
        for url in urls:
            if interrupted:
                break  # Exit the loop if interrupted

            driver.get(url)
            time.sleep(3)

            # Dictionary to store the profile's data
            profile_data = {
                "LinkedIn URL": url  # Add the LinkedIn URL as an identifier
            }

            # ... (further profile fields omitted in the original post) ...

            # Extract interests
            interests_url = adjust_url_for_section(url, 'interests')
            driver.get(interests_url)
            time.sleep(3)

            interests_data = {}
            for tab_name in tabs:
                interests_data[tab_name] = click_tab_and_extract_data(driver, tab_name)
            profile_data['Interests'] = interests_data

            # After successfully scraping a profile, add its timestamp to the list
            scraped_timestamps.append(datetime.now())

            # Flatten the data and append to the list
            flattened_profiles.append(flatten_data(profile_data))

            profiles_data.append(profile_data)

            # Pretty print the profile data
            print("\nExtracted Data for Profile:")
            pprint(profile_data)
            print("\n" + "-"*50 + "\n")

    except KeyboardInterrupt:
        interrupted = True
        print("Script manually stopped. Data saved up to this point.")

    # After all URLs are processed, write all data to Excel file
    df_temp = pd.DataFrame(flattened_profiles)

    try:
        # Append the new data to the existing file if it is already there
        df_existing = pd.read_excel(output_file)
        df_final = pd.concat([df_existing, df_temp], ignore_index=True)
    except FileNotFoundError:
        df_final = df_temp  # first run: nothing to append to

    # Save the combined data
    df_final.to_excel(output_file, index=False)

    # Save the extracted data to a JSON file (path left as a placeholder)
    with open("xxxxxxxxxxxx", 'w', encoding='utf-8') as json_file:
        json.dump(profiles_data, json_file, ensure_ascii=False, indent=4)

    # Keep the script (and the browser window) alive until manually closed
    while True:
        time.sleep(10)