Open charthee opened 3 years ago
Glad to see this repo getting some use! Could you please provide the URL you are scraping so that I can reproduce the issue?
Also, it looks like the scraping worked fine, but the subsequent data manipulation failed, i.e. the part that cleans the dates. If possible, could you also share a sample of the data that reproduces the issue? Should be able to paste the output of dput(yourdatahere)
Thank you so much for replying!
I just bumped into a new problem... here's what I'm seeing:
> source("R/scrape.R")
Loading libraries...
Sourcing functions...
> google_url <- "https://www.glassdoor.com/Reviews/Google-Reviews-E9079"
> pages <- 1:5
> out <- lapply(pages, function(page) {
+   Sys.sleep(1)
+   try_scrape_reviews(google_url, page)
+ })
Scraping page [1] at [2020-10-21 23:05:20]
Scraping page [2] at [2020-10-21 23:05:28]
Scraping page [3] at [2020-10-21 23:05:37]
Scraping page [4] at [2020-10-21 23:05:45]
Scraping page [5] at [2020-10-21 23:05:52]
Warning messages:
1: Failed to parse page [1]
2: Failed to parse page [2]
3: Failed to parse page [3]
4: Failed to parse page [4]
5: Failed to parse page [5]
Funny it worked perfectly yesterday (managed to scrape 200 pages!) but doesn't work as well today.. :(
So it seems that a lot of the xpaths have changed, so they really just need to be modified slightly. This can be done by right clicking the part you're interested in and inspecting the element, then copying the xpath. For example:
library(rvest)
library(xml2)
library(dplyr, warn.conflicts = FALSE)
library(glue)
library(stringr)
# Fetch and parse one page of a review listing.
#
# url:  listing URL without the "_P<page>.htm" suffix.
# page: page number appended to the URL.
#
# Returns the document produced by read_html().
read_page <- function(url, page) {
  page_url <- glue("{url}_P{page}.htm")
  read_html(page_url)
}
# Collect the id attribute of every element whose id contains "empReview".
#
# .data: parsed page, e.g. the result of read_page().
get_review_ids <- function(.data) {
  review_nodes <- html_nodes(.data, xpath = "//*[contains(@id, 'empReview')]")
  html_attr(review_nodes, "id")
}
# Load page 1 of the Google listing and gather the ids of the reviews on it.
url <- "https://www.glassdoor.com/Reviews/Google-Reviews-E9079"
page <- read_page(url = url, page = 1)
review_ids <- page %>% get_review_ids()
# Title text of one review, with every double-quote character removed.
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
get_review_title <- function(.data, review_id) {
  # XPath that worked before the page layout changed:
  #   '//*[@id="{review_id}"]/div/div[2]/div[2]/h2/a'
  # Current XPath (the h2 now sits inside an extra div[1]):
  title_xpath <- glue('//*[@id="{review_id}"]/div/div[2]/div[2]/div[1]/h2/a')
  title_nodes <- html_nodes(.data, xpath = title_xpath)
  str_remove_all(html_text(title_nodes), '"')
}
# Example: title of the first review id collected from the page.
get_review_title(page, review_ids[1])
#> [1] "Terrific research culture and benefits"
I probably won't resolve this right now because it will more than likely change a couple months from now. I would rather make an update that is more robust and less likely to break for a longer period of time.
Hi! Thank you so much for your awesome code! I need to scrape ratings and subratings for my thesis and I'm so happy to have encountered it :). Unfortunately, I haven't entirely been able to make it work. I think the Glassdoor website changed quite a bit and I also don't have a lot of experience coding... I already updated all the Xpaths of the data I need. This is the code I have been trying:
message("Loading libraries...") suppressPackageStartupMessages({ library(rvest) library(xml2) library(dplyr) library(lubridate) library(glue) library(stringr) library(tidyr) library(janitor) })
message("Sourcing functions...")

# Fetch and parse one page of a review listing.
#
# url:  listing URL without the "_P<page>.htm" suffix.
# page: page number appended to the URL.
#
# Returns the document produced by read_html().
read_page <- function(url, page) {
  page_url <- glue("{url}_P{page}.htm")
  read_html(page_url)
}
# Collect the id attribute of every element whose id contains "empReview".
#
# .data: parsed page, e.g. the result of read_page().
get_review_ids <- function(.data) {
  review_nodes <- html_nodes(.data, xpath = "//*[contains(@id, 'empReview')]")
  html_attr(review_nodes, "id")
}
# Text of the node(s) this script reads as the "pros" of one review.
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
#
# Returns the text of every node the XPath matches (possibly none).
get_employeer_pros <- function(.data, review_id) {
  pros_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[2]/div/div[2]/div[1]/p[2]/span'
  )
  pros_nodes <- html_nodes(.data, xpath = pros_xpath)
  html_text(pros_nodes)
}
# Text of the node(s) this script reads as the "cons" of one review.
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
#
# Returns the text of every node the XPath matches (possibly none).
get_employeer_cons <- function(.data, review_id) {
  cons_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[2]/div/div[2]/div[2]/p[2]/span'
  )
  cons_nodes <- html_nodes(.data, xpath = cons_xpath)
  html_text(cons_nodes)
}
# Overall rating of one review, read from the node text and converted with
# as.numeric().
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
get_overall_rating <- function(.data, review_id) {
  rating_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[1]/div/div/div/span[1]'
  )
  rating_nodes <- html_nodes(.data, xpath = rating_xpath)
  as.numeric(html_text(rating_nodes))
}
# Sub-category ratings of one review, spread into one column per category.
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
#
# For list items 1..6 of the review's rating list, reads the category label
# (text of div[1]) and the rating (numeric "title" attribute of div[2]).
# When no item matched at all, returns a one-row tibble of NA_real_ for the
# five known categories; otherwise the label/rating pairs are pivoted wide and
# the column names are converted to snake_case.
get_sub_ratings <- function(.data, review_id) {
  out <- lapply(1:6, function(x) {
    subcategory <- .data %>%
      html_nodes(xpath = glue('//*[@id="{review_id}"]/div/div/div[1]/div/div/aside/div/div/ul/li[{x}]/div[1]')) %>%
      html_text()
    rating <- .data %>%
      html_nodes(xpath = glue('//*[@id="{review_id}"]/div/div/div[1]/div/div/aside/div/div/ul/li[{x}]/div[2]')) %>%
      html_attr("title") %>%
      as.numeric()
    tibble(subcategory = subcategory, rating = rating)
  })

  no_sub_ratings <- sum(unlist(Map(nrow, out))) == 0
  if (no_sub_ratings) {
    # NA_real_ is the typed missing value; the pasted `NAreal` (underscores
    # lost in formatting) is an undefined symbol and made this branch error.
    tibble(
      "work_life_balance" = NA_real_,
      "culture_values" = NA_real_,
      "career_opportunities" = NA_real_,
      "compensation_and_benefits" = NA_real_,
      "senior_management" = NA_real_
    )
  } else {
    out %>%
      bind_rows() %>%
      pivot_wider(
        names_from = subcategory,
        values_from = rating
      ) %>%
      clean_names("snake")
  }
}
# Scrape one page of reviews into a tibble with one row per review id.
#
# url:         listing URL, handed to read_page().
# page_number: page to fetch.
#
# Columns: review_id, employeer_pros, employeer_cons, employeer_rating, followed
# by the columns produced by get_sub_ratings().
scrape_reviews <- function(url, page_number) {
  message("Scraping page [", page_number, "] at [", Sys.time(), "]")
  page <- read_page(url, page_number)
  review_ids <- get_review_ids(page)

  # A getter can match zero or several nodes for a review. Flattening those
  # results with unlist() would shift values onto the wrong review or make
  # tibble() fail on a length mismatch, so keep exactly one value per review:
  # the first match, or the typed NA in `fill` when nothing matched.
  one_per_review <- function(getter, fill) {
    vapply(
      review_ids,
      function(id) {
        found <- getter(page, id)
        if (length(found) == 0) fill else found[[1]]
      },
      FUN.VALUE = fill,
      USE.NAMES = FALSE
    )
  }

  employeer_pros <- one_per_review(get_employeer_pros, NA_character_)
  employeer_cons <- one_per_review(get_employeer_cons, NA_character_)
  employeer_rating <- one_per_review(get_overall_rating, NA_real_)
  subcategories <- bind_rows(lapply(review_ids, function(x) {
    get_sub_ratings(page, x)
  }))

  bind_cols(tibble(
    review_id = review_ids,
    employeer_pros = employeer_pros,
    employeer_cons = employeer_cons,
    employeer_rating = employeer_rating
  ), subcategories)
}
# Scrape one page of reviews, downgrading any error to a warning.
#
# url:  listing URL, passed through to scrape_reviews().
# page: page number to scrape.
#
# Returns the result of scrape_reviews(), or NULL when it raised an error; in
# that case a warning names the page and carries the underlying error message.
try_scrape_reviews <- function(url, page) {
  tryCatch(
    # Spell out `page_number`: the earlier `page = page` only reached that
    # parameter through R's partial argument matching.
    scrape_reviews(url = url, page_number = page),
    error = function(e) {
      warning(
        "Failed to parse page [", page, "]: ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}
tesla_url <- "https://www.glassdoor.com/Reviews/Tesla-Reviews-E43129"
apple_url <- "https://www.glassdoor.com/Reviews/Apple-Reviews-E1138"
google_url <- "https://www.glassdoor.com/Reviews/Google-Reviews-E9079"

pages <- 1:5

# Scrape each page, pausing one second between requests. A page that fails
# comes back as NULL from try_scrape_reviews().
out <- lapply(pages, function(page) {
  Sys.sleep(1)
  try_scrape_reviews(google_url, page)
})

# Label every result with its real page number before dropping failures;
# otherwise bind_rows() numbers the survivors 1, 2, 3, ... and every page
# after a failed one is mislabelled.
names(out) <- pages
reviews <- bind_rows(Filter(Negate(is.null), out), .id = "page")

# The review_time step is removed: scrape_reviews() as defined in this script
# does not produce a review_time_raw column and clean_review_datetime() is not
# defined here, so mutate() could only fail.
reviews %>%
  distinct() %>%
  mutate(page = as.numeric(page)) %>%
  select(
    page,
    review_id,
    employeer_pros,
    employeer_cons,
    employeer_rating,
    # Sub-rating columns are named from the scraped labels, so pick up
    # whichever of them are present instead of failing on a missing one.
    any_of(c(
      "work_life_balance",
      "culture_values",
      "career_opportunities",
      "compensation_and_benefits",
      "senior_management"
    ))
  ) %>%
  glimpse()
The main problem is the subratings. These are shown as little stars and I can't seem to figure out how to scrape them as ratings. Also, I'm getting this error message for the last command:
Would it be possible to help me out a little bit? :) I've been trying to get this right for a couple of days already.. Thanks so much in advance!
Hi @insephe could you reformat the code you provide above so that it can easily be copied and pasted?
You will want to use triple backticks, see https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#quoting-code
Sure! (sorry, I don't have a lot of experience :) ). Glassdoor also included a new subrating category 'diversity&inclusion', so I tried to already take this into account in the modified code below. I think the main problem is the subrating part. Thanks for responding so quickly!
message("Loading libraries...")
suppressPackageStartupMessages({
library(rvest)
library(xml2)
library(dplyr)
library(lubridate)
library(glue)
library(stringr)
library(tidyr)
library(janitor)
})
message("Sourcing functions...")
# Fetch and parse one page of a review listing.
#
# url:  listing URL without the "_P<page>.htm" suffix.
# page: page number appended to the URL.
#
# Returns the document produced by read_html().
read_page <- function(url, page) {
  page_url <- glue("{url}_P{page}.htm")
  read_html(page_url)
}
# Collect the id attribute of every element whose id contains "empReview".
#
# .data: parsed page, e.g. the result of read_page().
get_review_ids <- function(.data) {
  review_nodes <- html_nodes(.data, xpath = "//*[contains(@id, 'empReview')]")
  html_attr(review_nodes, "id")
}
# Text of the node(s) this script reads as the "pros" of one review.
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
#
# Returns the text of every node the XPath matches (possibly none).
get_employeer_pros <- function(.data, review_id) {
  pros_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[2]/div/div[2]/div[1]/p[2]/span'
  )
  pros_nodes <- html_nodes(.data, xpath = pros_xpath)
  html_text(pros_nodes)
}
# Text of the node(s) this script reads as the "cons" of one review.
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
#
# Returns the text of every node the XPath matches (possibly none).
get_employeer_cons <- function(.data, review_id) {
  cons_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[2]/div/div[2]/div[2]/p[2]/span'
  )
  cons_nodes <- html_nodes(.data, xpath = cons_xpath)
  html_text(cons_nodes)
}
# Overall rating of one review, read from the node text and converted with
# as.numeric().
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
get_overall_rating <- function(.data, review_id) {
  rating_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[1]/div/div/div/span[1]'
  )
  rating_nodes <- html_nodes(.data, xpath = rating_xpath)
  as.numeric(html_text(rating_nodes))
}
# Sub-category ratings of one review, spread into one column per category.
#
# .data:       parsed page, e.g. the result of read_page().
# review_id:   id attribute of the review element to look inside.
# rating_attr: name (a string) of the attribute on each item's div[2] that
#              holds the rating; its value is converted with as.numeric().
#
# For list items 1..6 of the review's rating list, reads the category label
# (text of div[1]) and the rating attribute of div[2]. When no item matched
# at all, returns a one-row tibble of NA_real_ for the six known categories;
# otherwise the label/rating pairs are pivoted wide and the column names are
# converted to snake_case.
get_sub_ratings <- function(.data, review_id, rating_attr = "title") {
  out <- lapply(1:6, function(x) {
    subcategory <- .data %>%
      html_nodes(xpath = glue('//*[@id="{review_id}"]/div/div/div[1]/div/div/aside/div/div/ul/li[{x}]/div[1]')) %>%
      html_text()
    rating <- .data %>%
      html_nodes(xpath = glue('//*[@id="{review_id}"]/div/div/div[1]/div/div/aside/div/div/ul/li[{x}]/div[2]')) %>%
      # html_attr() needs the attribute name as a string; the bare symbol
      # `tooptipContainer` was an undefined object and aborted every call.
      # NOTE(review): "title" is the attribute an earlier version of this
      # function read - confirm it still carries the rating in the live markup.
      html_attr(rating_attr) %>%
      as.numeric()
    tibble(subcategory = subcategory, rating = rating)
  })

  no_sub_ratings <- sum(unlist(Map(nrow, out))) == 0
  if (no_sub_ratings) {
    tibble(
      "work_life_balance" = NA_real_,
      "culture_values" = NA_real_,
      "diversity_inclusion" = NA_real_,
      "career_opportunities" = NA_real_,
      "compensation_and_benefits" = NA_real_,
      "senior_management" = NA_real_
    )
  } else {
    out %>%
      bind_rows() %>%
      pivot_wider(
        names_from = subcategory,
        values_from = rating
      ) %>%
      clean_names("snake")
  }
}
# Scrape one page of reviews into a tibble with one row per review id.
#
# url:         listing URL, handed to read_page().
# page_number: page to fetch.
#
# Columns: review_id, employeer_pros, employeer_cons, employeer_rating, followed
# by the columns produced by get_sub_ratings().
scrape_reviews <- function(url, page_number) {
  message("Scraping page [", page_number, "] at [", Sys.time(), "]")
  page <- read_page(url, page_number)
  review_ids <- get_review_ids(page)

  # A getter can match zero or several nodes for a review. Flattening those
  # results with unlist() would shift values onto the wrong review or make
  # tibble() fail on a length mismatch, so keep exactly one value per review:
  # the first match, or the typed NA in `fill` when nothing matched.
  one_per_review <- function(getter, fill) {
    vapply(
      review_ids,
      function(id) {
        found <- getter(page, id)
        if (length(found) == 0) fill else found[[1]]
      },
      FUN.VALUE = fill,
      USE.NAMES = FALSE
    )
  }

  #review_time <- unlist(lapply(review_ids, get_review_datetime, .data = page))
  #review_title <- unlist(lapply(review_ids, get_review_title, .data = page))
  #employee_role <- unlist(lapply(review_ids, get_employee_role, .data = page))
  #employee_history <- unlist(lapply(review_ids, get_employee_history, .data = page))
  employeer_pros <- one_per_review(get_employeer_pros, NA_character_)
  employeer_cons <- one_per_review(get_employeer_cons, NA_character_)
  employeer_rating <- one_per_review(get_overall_rating, NA_real_)
  subcategories <- bind_rows(lapply(review_ids, function(x) {
    get_sub_ratings(page, x)
  }))

  bind_cols(tibble(
    review_id = review_ids,
    employeer_pros = employeer_pros,
    employeer_cons = employeer_cons,
    employeer_rating = employeer_rating
  ), subcategories)
}
# Scrape one page of reviews, downgrading any error to a warning.
#
# url:  listing URL, passed through to scrape_reviews().
# page: page number to scrape.
#
# Returns the result of scrape_reviews(), or NULL when it raised an error; in
# that case a warning names the page and carries the underlying error message.
try_scrape_reviews <- function(url, page) {
  tryCatch(
    # Spell out `page_number`: the earlier `page = page` only reached that
    # parameter through R's partial argument matching.
    scrape_reviews(url = url, page_number = page),
    error = function(e) {
      warning(
        "Failed to parse page [", page, "]: ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}
#____________________________
tesla_url <- "https://www.glassdoor.com/Reviews/Tesla-Reviews-E43129"
apple_url <- "https://www.glassdoor.com/Reviews/Apple-Reviews-E1138"
google_url <- "https://www.glassdoor.com/Reviews/Google-Reviews-E9079"

pages <- 1:5

# Scrape each page, pausing one second between requests. A page that fails
# comes back as NULL from try_scrape_reviews().
out <- lapply(pages, function(page) {
  Sys.sleep(1)
  try_scrape_reviews(google_url, page)
})

# Label every result with its real page number before dropping failures;
# otherwise bind_rows() numbers the survivors 1, 2, 3, ... and every page
# after a failed one is mislabelled.
names(out) <- pages

# filter for stuff we successfully extracted
reviews <- bind_rows(Filter(Negate(is.null), out), .id = "page")

# remove any duplicates and keep the columns of interest.
# The review_time step is removed: the review_time extraction in
# scrape_reviews() is commented out, so there is no review_time_raw column
# and mutate() failed with "object 'review_time_raw' not found".
reviews %>%
  distinct() %>%
  mutate(page = as.numeric(page)) %>%
  select(
    page,
    review_id,
    employeer_pros,
    employeer_cons,
    employeer_rating,
    # Sub-rating columns are named from the scraped labels, so pick up
    # whichever of them are present (diversity_inclusion included) instead
    # of failing on a missing one.
    any_of(c(
      "work_life_balance",
      "culture_values",
      "diversity_inclusion",
      "career_opportunities",
      "compensation_and_benefits",
      "senior_management"
    ))
  ) %>%
  glimpse()
Thanks, I noticed you commented out a couple of the functions, such as get_review_datetime,
which is fine because it looks like the xpaths for these require updates anyway.
If you just need the employer pros/cons, this is currently working:
message("Loading libraries...")
suppressPackageStartupMessages({
library(rvest)
library(xml2)
library(dplyr)
library(lubridate)
library(glue)
library(stringr)
library(tidyr)
library(janitor)
})
message("Sourcing functions...")
# Fetch and parse one page of a review listing.
#
# url:  listing URL without the "_P<page>.htm" suffix.
# page: page number appended to the URL.
#
# Returns the document produced by read_html().
read_page <- function(url, page) {
  page_url <- glue("{url}_P{page}.htm")
  read_html(page_url)
}
# Collect the id attribute of every element whose id contains "empReview".
#
# .data: parsed page, e.g. the result of read_page().
get_review_ids <- function(.data) {
  review_nodes <- html_nodes(.data, xpath = "//*[contains(@id, 'empReview')]")
  html_attr(review_nodes, "id")
}
# Text of the node(s) this script reads as the "pros" of one review.
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
#
# Returns the text of every node the XPath matches (possibly none).
get_employeer_pros <- function(.data, review_id) {
  pros_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[2]/div/div[2]/div[1]/p[2]/span'
  )
  pros_nodes <- html_nodes(.data, xpath = pros_xpath)
  html_text(pros_nodes)
}
# Text of the node(s) this script reads as the "cons" of one review.
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
#
# Returns the text of every node the XPath matches (possibly none).
get_employeer_cons <- function(.data, review_id) {
  cons_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[2]/div/div[2]/div[2]/p[2]/span'
  )
  cons_nodes <- html_nodes(.data, xpath = cons_xpath)
  html_text(cons_nodes)
}
# Overall rating of one review, read from the node text and converted with
# as.numeric().
#
# .data:     parsed page, e.g. the result of read_page().
# review_id: id attribute of the review element to look inside.
get_overall_rating <- function(.data, review_id) {
  rating_xpath <- glue(
    '//*[@id="{review_id}"]/div/div/div[1]/div/div/div/span[1]'
  )
  rating_nodes <- html_nodes(.data, xpath = rating_xpath)
  as.numeric(html_text(rating_nodes))
}
# Sub-category ratings of one review, spread into one column per category.
#
# .data:       parsed page, e.g. the result of read_page().
# review_id:   id attribute of the review element to look inside.
# rating_attr: name (a string) of the attribute on each item's div[2] that
#              holds the rating; its value is converted with as.numeric().
#
# For list items 1..6 of the review's rating list, reads the category label
# (text of div[1]) and the rating attribute of div[2]. When no item matched
# at all, returns a one-row tibble of NA_real_ for the six known categories;
# otherwise the label/rating pairs are pivoted wide and the column names are
# converted to snake_case.
get_sub_ratings <- function(.data, review_id, rating_attr = "title") {
  out <- lapply(1:6, function(x) {
    subcategory <- .data %>%
      html_nodes(xpath = glue('//*[@id="{review_id}"]/div/div/div[1]/div/div/aside/div/div/ul/li[{x}]/div[1]')) %>%
      html_text()
    rating <- .data %>%
      html_nodes(xpath = glue('//*[@id="{review_id}"]/div/div/div[1]/div/div/aside/div/div/ul/li[{x}]/div[2]')) %>%
      # html_attr() needs the attribute name as a string; the bare symbol
      # `tooptipContainer` was an undefined object and aborted every call.
      # NOTE(review): "title" is an assumed default - confirm which attribute
      # carries the rating in the live markup.
      html_attr(rating_attr) %>%
      as.numeric()
    tibble(subcategory = subcategory, rating = rating)
  })

  no_sub_ratings <- sum(unlist(Map(nrow, out))) == 0
  if (no_sub_ratings) {
    tibble(
      "work_life_balance" = NA_real_,
      "culture_values" = NA_real_,
      "diversity_inclusion" = NA_real_,
      "career_opportunities" = NA_real_,
      "compensation_and_benefits" = NA_real_,
      "senior_management" = NA_real_
    )
  } else {
    out %>%
      bind_rows() %>%
      pivot_wider(
        names_from = subcategory,
        values_from = rating
      ) %>%
      clean_names("snake")
  }
}
# Scrape one page of reviews into a tibble with one row per review id.
#
# url:         listing URL, handed to read_page().
# page_number: page to fetch.
#
# Columns: review_id, employeer_pros, employeer_cons, employeer_rating.
scrape_reviews <- function(url, page_number) {
  message("Scraping page [", page_number, "] at [", Sys.time(), "]")
  page <- read_page(url, page_number)
  review_ids <- get_review_ids(page)

  # A getter can match zero or several nodes for a review. Flattening those
  # results with unlist() would shift values onto the wrong review or make
  # tibble() fail on a length mismatch, so keep exactly one value per review:
  # the first match, or the typed NA in `fill` when nothing matched.
  one_per_review <- function(getter, fill) {
    vapply(
      review_ids,
      function(id) {
        found <- getter(page, id)
        if (length(found) == 0) fill else found[[1]]
      },
      FUN.VALUE = fill,
      USE.NAMES = FALSE
    )
  }

  # tibble() alone is enough here; there is nothing to bind columns with.
  tibble(
    review_id = review_ids,
    employeer_pros = one_per_review(get_employeer_pros, NA_character_),
    employeer_cons = one_per_review(get_employeer_cons, NA_character_),
    employeer_rating = one_per_review(get_overall_rating, NA_real_)
  )
}
# Scrape one page of reviews, downgrading any error to a warning.
#
# url:  listing URL, passed through to scrape_reviews().
# page: page number to scrape.
#
# Returns the result of scrape_reviews(), or NULL when it raised an error; in
# that case a warning names the page and carries the underlying error message.
try_scrape_reviews <- function(url, page) {
  tryCatch(
    # Spell out `page_number`: the earlier `page = page` only reached that
    # parameter through R's partial argument matching.
    scrape_reviews(url = url, page_number = page),
    error = function(e) {
      warning(
        "Failed to parse page [", page, "]: ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}
google_url <- "https://www.glassdoor.com/Reviews/Google-Reviews-E9079"
pages <- 1:5

# Scrape each page, pausing one second between requests. A page that fails
# comes back as NULL from try_scrape_reviews().
out <- lapply(pages, function(page) {
  Sys.sleep(1)
  try_scrape_reviews(google_url, page)
})

# Label every result with its real page number before dropping failures;
# otherwise bind_rows() numbers the survivors 1, 2, 3, ... and every page
# after a failed one is mislabelled.
names(out) <- pages
bind_rows(Filter(Negate(is.null), out), .id = "page")
Hi there! Firstly, THANK YOU so much for the code, super helpful!!!
While I managed to scrape 200 pages, I got this error message:

Error: Problem with `mutate()` input `review_time`.
x object 'review_time_raw' not found
i Input `review_time` is `clean_review_datetime(review_time_raw)`.
Run `rlang::last_error()` to see where the error occurred.

after trying to run this code:
# Stack the pages that were scraped successfully (NULL entries dropped),
# adding a "page" id column.
reviews <- bind_rows(Filter(Negate(is.null), out), .id = "page")

# De-duplicate, derive review_time from review_time_raw, then keep and order
# the columns of interest.
reviews %>%
  distinct() %>%
  mutate(
    review_time = clean_review_datetime(review_time_raw),
    page = as.numeric(page)
  ) %>%
  select(
    page,
    review_id,
    review_time_raw,
    review_time,
    review_title,
    employee_role,
    employee_history,
    employeer_pros,
    employeer_cons,
    employeer_rating,
    work_life_balance,
    culture_values,
    career_opportunities,
    compensation_and_benefits,
    senior_management
  ) %>%
  glimpse()
Could you please help figure out what went wrong? Really appreciate it.