Open woneuy01 opened 4 years ago
The dot operator lets you access values stored in the data that is being piped in with the %>% pipe operator. The dot is a placeholder for the data passed in through the pipe. Using the dot, a dplyr pipeline can return a single vector or number instead of only a data frame. For example, us_murder_rate %>% .$rate is equivalent to us_murder_rate$rate.
library(tidyverse)
library(dslabs)
data(murders)

# Per-state murder rate per 100,000 people.
# (`total` is the number of murders in each state.)
murders <- murders %>%
  mutate(murder_rate = total / population * 100000)

# Plain mean of the per-state rates; each state contributes one value.
summarize(murders, mean(murder_rate))

# Overall rate: summed murders over summed population, per 100,000.
us_murder_rate <- murders %>%
  summarize(rate = sum(total) / sum(population) * 100000)
us_murder_rate
rate 1 3.034555
# `.` stands for the object piped in, so this pulls out the `rate` column as a plain vector.
us_murder_rate %>% .$rate
[1] 3.034555
# calculate and extract the murder rate with one pipe
# sum(population) is closed before scaling by 100000, so that summarize()
# itself is closed before the result is piped into `.$rate`.
us_murder_rate <- murders %>%
  summarize(rate = sum(total) / sum(population) * 100000) %>%
  .$rate
Note that an equivalent way to extract a single column using the pipe is us_murder_rate %>% pull(rate).
summarize and some other dplyr functions will behave differently on grouped data frames. Using summarize on a grouped data frame computes the summary statistics for each of the separate groups.
Code
library(tidyverse)
library(dslabs)
data(heights)
data(murders)

# Mean and standard deviation of height, computed separately for each sex.
heights %>%
  group_by(sex) %>%
  summarize(
    average = mean(height),
    standard_deviation = sd(height)
  )
A tibble: 2 x 3 sex average standard_deviation
1 Female 64.9 3.76 2 Male 69.3 3.61
# Add the per-100,000 murder rate, then take its median within each region.
murders <- murders %>%
  mutate(murder_rate = total / population * 1e5)

murders %>%
  group_by(region) %>%
  summarize(median_rate = median(murder_rate))
region median_rate
1 Northeast 1.80 2 South 3.40 3 North Central 1.97 4 West 1.29
library(tidyverse)
library(dslabs)
data(murders)

murders <- murders %>%
  mutate(murder_rate = total / population * 100000)

# First rows when sorted by population, ascending.
murders %>%
  arrange(population) %>%
  head()

# First rows when sorted by murder rate, ascending.
murders %>%
  arrange(murder_rate) %>%
  head()

# First rows when sorted by murder rate, descending.
murders %>%
  arrange(desc(murder_rate)) %>%
  head()

# Sort by region first, then by murder rate within each region.
murders %>%
  arrange(region, murder_rate) %>%
  head()

# The 10 rows with the highest murder rate (top_n does not sort them).
murders %>%
  top_n(10, murder_rate)

# Same 10 rows, shown in descending order. The weight column is named
# explicitly instead of relying on top_n()'s default of the last column.
murders %>%
  arrange(desc(murder_rate)) %>%
  top_n(10, murder_rate)

# Summaries that skip missing values via na.rm = TRUE.
# na_example is loaded explicitly, like the other datasets in these notes.
data(na_example)
mean(na_example, na.rm = TRUE)
sd(na_example, na.rm = TRUE)
library(dplyr)
library(NHANES)
data(NHANES)

# Show the first six rows of the NHANES data.
head(NHANES, n = 6L)
ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education MaritalStatus HHIncome
1 51624 2009_10 male 34 " 30-39" 409 White NA High Sch… Married 25000-3… 2 51624 2009_10 male 34 " 30-39" 409 White NA High Sch… Married 25000-3… 3 51624 2009_10 male 34 " 30-39" 409 White NA High Sch… Married 25000-3… 4 51625 2009_10 male 4 " 0-9" 49 Other NA NA NA 20000-2… 5 51630 2009_10 female 49 " 40-49" 596 White NA Some Col… LivePartner 35000-4… 6 51638 2009_10 male 9 " 0-9" 115 White NA NA NA 75000-9…
# Keep only females in the " 20-29" age decade (the level has a leading space).
# Comma-separated conditions in filter() are combined with AND.
tab <- NHANES %>%
  filter(AgeDecade == " 20-29", Gender == "female")
Use the summarize function after filtering for 20-29 year old females and connect the results using the pipe %>%. When doing this remember there are NAs in the data!
library(dplyr)
library(NHANES)
data(NHANES)

# Mean and standard deviation of systolic blood pressure for 20-29 year old
# females. na.rm = TRUE is needed because BPSysAve contains NAs.
ref <- NHANES %>%
  filter(AgeDecade == " 20-29" & Gender == "female") %>%
  summarise(
    average = mean(BPSysAve, na.rm = TRUE),
    standard_deviation = sd(BPSysAve, na.rm = TRUE)
  )
library(dplyr)
library(NHANES)
data(NHANES)

# Summarise systolic blood pressure for 20-29 year old females, then keep
# only the mean as a plain number.
ref_avg <- NHANES %>%
  filter(AgeDecade == " 20-29", Gender == "female") %>%
  summarize(
    ref_avg = mean(BPSysAve, na.rm = TRUE),
    standard_deviation = sd(BPSysAve, na.rm = TRUE)
  ) %>%
  pull(ref_avg)
Use filter and summarize connected by the pipe %>% again. The functions min and max can be used to get the values you want. Within summarize, save the min and max of systolic blood pressure as minbp and maxbp
library(dplyr)
library(NHANES)
data(NHANES)

# Smallest and largest systolic blood pressure among 20-29 year old females,
# ignoring missing values.
NHANES %>%
  filter(AgeDecade == " 20-29", Gender == "female") %>%
  summarize(
    minbp = min(BPSysAve, na.rm = TRUE),
    maxbp = max(BPSysAve, na.rm = TRUE)
  )
Within summarize, save the average and standard deviation of systolic blood pressure (BPSysAve) as average and standard_deviation
library(dplyr)
library(NHANES)
data(NHANES)

# Systolic blood pressure mean and standard deviation for females,
# one row per age decade.
NHANES %>%
  filter(Gender == "female") %>%
  group_by(AgeDecade) %>%
  summarize(
    average = mean(BPSysAve, na.rm = TRUE),
    standard_deviation = sd(BPSysAve, na.rm = TRUE)
  )
Note that we no longer have to filter!
library(NHANES)
data(NHANES)

# Grouping by both age decade and gender gives one summary row per
# combination, so no filter step is required.
NHANES %>%
  group_by(AgeDecade, Gender) %>%
  summarize(
    average = mean(BPSysAve, na.rm = TRUE),
    standard_deviation = sd(BPSysAve, na.rm = TRUE)
  )
library(dplyr)
library(NHANES)
data(NHANES)

# Systolic blood pressure for 40-49 year old males, summarised per Race1
# level and ordered from lowest to highest average.
NHANES %>%
  filter(Gender == "male", AgeDecade == " 40-49") %>%
  group_by(Race1) %>%
  summarize(
    average = mean(BPSysAve, na.rm = TRUE),
    standard_deviation = sd(BPSysAve, na.rm = TRUE)
  ) %>%
  arrange(average)
library(tidyverse)
library(dslabs)
data(heights)

# compute average and standard deviation for males
s <- heights %>%
  filter(sex == "Male") %>%
  summarize(average = mean(height), standard_deviation = sd(height))

# access average and standard deviation from summary table
s$average
s$standard_deviation

# compute median, min and max
heights %>%
  filter(sex == "Male") %>%
  summarize(
    median = median(height),
    minimum = min(height),
    maximum = max(height)
  )

# alternative way to get min, median, max in base R
# (note: this call uses every height, not only the male subset)
quantile(heights$height, c(0, 0.5, 1))

# summarize expects functions that return a single value, and quantile()
# returns three values here. In dplyr before 1.0.0 this generates an error.
# dplyr >= 1.0.0 returns one row per value instead, and dplyr >= 1.1.0 also
# warns and recommends reframe() for multi-row results.
heights %>%
  filter(sex == "Male") %>%
  summarize(range = quantile(height, c(0, 0.5, 1)))