Closed rafliogun49 closed 5 months ago
I want to scrape Google Scholar profiles from a Jupyter notebook. Why doesn't it work?
# Scrape the Google Scholar profile of each researcher listed in a remote
# CSV and dump the collected publication data to data_publications.json.
#
# Fixes vs. the original snippet:
#   - `time` and `json` were used but never imported -> NameError at runtime.
#   - The "Show more" button lookup indexed [0] without checking the list,
#     which raises IndexError on profiles where the button is absent.
#   - `driver.close()` only closes the current window; `driver.quit()` is
#     needed to end the WebDriver session and free the browser process.
import json
import time

import pandas as pd
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.by import By

# Load the researcher list and keep only rows that have a Scholar link.
csv_file = pd.read_csv(
    "https://raw.githubusercontent.com/pkr-br/scholar-scraping/main/list_peneliti.csv"
)
list_peneliti = csv_file[csv_file["link_scholar"].notna()]

# Ask Scholar to list each profile's works sorted by publication date.
added_link = "&view_op=list_works&sortby=pubdate"
list_peneliti["updated_link_scholar"] = [
    link + added_link for link in list_peneliti["link_scholar"]
]

# Headless X display so Chrome can run inside a notebook kernel / server.
disp = Display()
disp.start()
driver = webdriver.Chrome()

all_publications = {}
try:
    for index, row in list_peneliti.iterrows():
        nama_peneliti = row["nama"]
        driver.get(row["updated_link_scholar"])
        driver.implicitly_wait(2)

        # Click "Show more" up to 5 times to load additional publications.
        for i in range(5):
            buttons = driver.find_elements(By.XPATH, '//button[@id="gsc_bpf_more"]')
            if not buttons:
                break  # button not present on this profile: nothing to expand
            btn = buttons[0]
            if btn.get_attribute("disabled"):
                break  # all publications already loaded
            btn.click()
            time.sleep(3)  # give the page time to append the next batch
            print("click " + str(i + 1))
        print("click finish " + nama_peneliti)

        # h-index lives in the 3rd row of the citation-stats table.
        index_table = driver.find_element(By.ID, "gsc_rsb_st")
        h_index_row = index_table.find_elements(By.TAG_NAME, "tr")[2]
        h_index = h_index_row.find_element(By.CLASS_NAME, "gsc_rsb_std").text

        # Research interests / specialities (may be absent on some profiles;
        # find_elements returns [] then, so the comprehension yields []).
        specialities = driver.find_elements(By.CLASS_NAME, "gsc_prf_inta.gs_ibl")
        speciality_list = [element.text for element in specialities]

        publications_list = []
        for publication in driver.find_elements(By.CLASS_NAME, "gsc_a_tr"):
            data_list = {}
            title_el = publication.find_element(By.CLASS_NAME, "gsc_a_at")
            data_list["title"] = str(title_el.text)
            data_list["link"] = str(title_el.get_attribute("href"))

            year = publication.find_element(By.CLASS_NAME, "gsc_a_y").text
            if year != "":
                data_list["year"] = int(year)

            # Citation count; "\n*" marks "includes citations to ..." rows.
            citate = publication.find_element(By.CLASS_NAME, "gsc_a_c").text
            citate = citate.replace("\n*", "")
            data_list["cited by"] = int(citate) if citate != "" else 0

            gray = publication.find_elements(By.CLASS_NAME, "gs_gray")
            journal = gray[1].text
            data_list["journal"] = str(journal) if journal != "" else "-"
            data_list["authors"] = str(gray[0].text)

            publications_list.append(data_list)

        all_publications[nama_peneliti] = {
            "name": nama_peneliti,
            "publications": publications_list,
            "specialities": speciality_list,
            "h_index": h_index,
        }
        print("peneliti " + str(index + 1) + " " + nama_peneliti + " selesai")
finally:
    # Persist whatever was collected, then tear down browser and display
    # even if a profile mid-run raised.
    file_path = "data_publications.json"
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(all_publications, json_file, indent=2, ensure_ascii=False)
    driver.quit()
    disp.stop()
I want to scrape Google Scholar profiles from a Jupyter notebook. Why doesn't it work?