ponty / PyVirtualDisplay

Python wrapper for Xvfb, Xephyr and Xvnc
BSD 2-Clause "Simplified" License
697 stars 78 forks source link

FileNotFoundError: [WinError 2] The system cannot find the file specified #94

Closed rafliogun49 closed 5 months ago

rafliogun49 commented 5 months ago

I want to scrape Google Scholar profiles in a Jupyter notebook. Why doesn't it work?

# Scraping

import json
import time

import pandas as pd
# import chromedriver_autoinstaller_fix
# import pyvirtualdisplay
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait

# Load the researcher list and build, for each researcher with a Scholar
# link, a profile URL sorted by publication date (newest first).
csv_file = pd.read_csv("https://raw.githubusercontent.com/pkr-br/scholar-scraping/main/list_peneliti.csv")
# .copy() so the filtered frame is independent — assigning a new column to a
# slice of csv_file would trigger pandas' SettingWithCopyWarning.
list_peneliti = csv_file[csv_file['link_scholar'].notna()].copy()
added_link = "&view_op=list_works&sortby=pubdate"
# Vectorized string concatenation instead of a Python-level list comprehension.
list_peneliti["updated_link_scholar"] = list_peneliti['link_scholar'] + added_link

# # chromedriver_autoinstaller_fix.install()
# display = pyvirtualdisplay.Display()
# display.start()
# Start a virtual X display so Chrome can run without a physical screen,
# then launch the Chrome WebDriver inside it.
# NOTE(review): pyvirtualdisplay shells out to the Xvfb binary, which is an
# X11/Linux tool. On Windows the binary does not exist, so Display.start()
# fails with FileNotFoundError [WinError 2] — this is the reported error.
# On Windows, skip the Display entirely (or run Chrome in headless mode).
disp = Display()
disp.start()
driver = webdriver.Chrome()

# Scrape each researcher's Google Scholar profile: expand the publication
# table, then collect h-index, specialities, and per-publication metadata
# into all_publications keyed by researcher name.
all_publications = {}
for index, row in list_peneliti.iterrows():
    nama_peneliti = row["nama"]
    URL = row["updated_link_scholar"]
    driver.get(URL)
    driver.implicitly_wait(2)

    # Click the "Show more" button up to 5 times to load additional rows.
    for i in range(5):
        more_buttons = driver.find_elements(By.XPATH, '//button[@id="gsc_bpf_more"]')
        if not more_buttons:
            # Guard: the original indexed [0] unconditionally, which raises
            # IndexError when the button is missing (e.g. page failed to load).
            break
        if more_buttons[0].get_attribute("disabled"):
            break  # button disabled => all publications are already shown
        more_buttons[0].click()
        time.sleep(3)  # give the AJAX-loaded rows time to render
        print("click " + str(i + 1))
    print("click finish " + nama_peneliti)

    # Extract h-index: third row of the citation-statistics table.
    index_table = driver.find_element(By.ID, "gsc_rsb_st")
    h_index_row = index_table.find_elements(By.TAG_NAME, "tr")[2]
    h_index = h_index_row.find_element(By.CLASS_NAME, "gsc_rsb_std").text

    # Extract speciality tags; empty list when the profile lists none.
    specialities = driver.find_elements(By.CLASS_NAME, "gsc_prf_inta.gs_ibl")
    speciality_list = [element.text for element in specialities]

    # Extract one dict of metadata per publication row.
    publications = driver.find_elements(By.CLASS_NAME, "gsc_a_tr")
    publications_list = []
    for publication in publications:
        data_list = {}
        # Single lookup for the title anchor (original queried it twice).
        title_el = publication.find_element(By.CLASS_NAME, "gsc_a_at")
        data_list["title"] = str(title_el.text)
        data_list['link'] = str(title_el.get_attribute("href"))
        year = publication.find_element(By.CLASS_NAME, "gsc_a_y").text
        # Always emit "year" so every record has the same schema
        # (the original silently omitted the key when the cell was empty).
        data_list["year"] = int(year) if year != "" else None
        citate = publication.find_element(By.CLASS_NAME, "gsc_a_c").text
        # "\n*" suffix marks counts that include merged/duplicate citations.
        citate = citate.replace("\n*", "")
        data_list["cited by"] = int(citate) if citate != "" else 0
        # gs_gray elements: [0] = author list, [1] = journal/venue.
        gray_fields = publication.find_elements(By.CLASS_NAME, "gs_gray")
        authors = gray_fields[0].text
        journal = gray_fields[1].text
        data_list["journal"] = str(journal) if journal != "" else "-"
        data_list["authors"] = str(authors)

        publications_list.append(data_list)
    all_publications[nama_peneliti] = {
        "name": nama_peneliti,
        "publications": publications_list,
        "specialities": speciality_list,
        "h_index": h_index
    }
    print("peneliti " + str(index + 1) + " " + nama_peneliti + " selesai")

# Persist everything that was scraped as a UTF-8 JSON file.
output_path = 'data_publications.json'
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(all_publications, out_file, indent=2, ensure_ascii=False)

# Tear down: quit() ends the whole WebDriver session and terminates the
# chromedriver process — close() would only close the current browser
# window and leave the driver process running. Then stop the virtual display.
driver.quit()
disp.stop()