woneuy01 / R-visualization

0 stars 0 forks source link

Data Visualization #8

Open woneuy01 opened 4 years ago

woneuy01 commented 4 years ago

Code

dot plot showing the data

# Dot plot: one point per row of `heights`, with sex on the x-axis and
# height on the y-axis.
ggplot(data = heights, mapping = aes(x = sex, y = height)) +
  geom_point()

jittered, alpha blended point plot

# Same sex/height mapping as a point plot, but drawn with geom_jitter():
# a jitter width of 0.1 and alpha = 0.2 so overlapping points stay visible.
ggplot(data = heights, mapping = aes(x = sex, y = height)) +
  geom_jitter(width = 0.1, alpha = 0.2)

woneuy01 commented 4 years ago

https://rafalab.github.io/dsbook/data-visualization-principles.html#consider-transformations

woneuy01 commented 4 years ago

# Palette of eight hex colours (named colour-blind friendly by the author).
color_blind_friendly_cols <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Demo plot: eight points on the diagonal, each with its own colour level.
p1 <- ggplot(
  data = data.frame(x = 1:8, y = 1:8, col = as.character(1:8)),
  mapping = aes(x = x, y = y, color = col)
) +
  geom_point(size = 5)

# Map the eight colour levels onto the palette above.
p1 + scale_color_manual(values = color_blind_friendly_cols)

woneuy01 commented 4 years ago

library(tidyverse)
library(dslabs)
data(gapminder)

# Regions kept for the comparison.
west <- c(
  "Western Europe", "Northern Europe", "Southern Europe",
  "Northern America", "Australia and New Zealand"
)

# 2010 and 2015 rows for countries in those regions that have a recorded
# life expectancy and a population above 10^7.
dat <- gapminder %>%
  filter(
    year %in% c(2010, 2015),
    region %in% west,
    !is.na(life_expectancy),
    population > 10^7
  )

# Slope chart: one line per country between the two years, with the country
# name printed at each end.
dat %>%
  mutate(
    # Label x-position: 1 for 2010, 2 for 2015 ...
    location = ifelse(year == 2010, 1, 2),
    # ... with the 2015 labels of two countries shifted right by 0.22.
    location = ifelse(
      year == 2015 & country %in% c("United Kingdom", "Portugal"),
      location + 0.22,
      location
    ),
    # hjust is 1 for the 2010 labels and 0 for the 2015 labels.
    hjust = ifelse(year == 2010, 1, 0)
  ) %>%
  mutate(year = as.factor(year)) %>%
  ggplot(aes(x = year, y = life_expectancy, group = country)) +
  geom_line(aes(color = country), show.legend = FALSE) +
  geom_text(
    aes(x = location, label = country, hjust = hjust),
    show.legend = FALSE
  ) +
  labs(x = "", y = "Life Expectancy")

woneuy01 commented 4 years ago

# Code: Bland-Altman plot
# Plots, per country, the difference between the 2015 and 2010 life
# expectancy against the average of the two. Uses `dat` as built for the
# slope chart (one row per country and year).
library(ggrepel)

dat %>%
  # The prefix needs its underscores: spread() turns these values into the
  # column names life_expectancy_2010 / life_expectancy_2015 that the
  # following mutate() refers to.
  mutate(year = paste0("life_expectancy_", year)) %>%
  select(country, year, life_expectancy) %>%
  spread(year, life_expectancy) %>%
  mutate(
    average = (life_expectancy_2015 + life_expectancy_2010) / 2,
    difference = life_expectancy_2015 - life_expectancy_2010
  ) %>%
  ggplot(aes(average, difference, label = country)) +
  geom_point() +
  geom_text_repel() +
  geom_abline(lty = 2) +
  xlab("Average of 2010 and 2015") +
  ylab("Difference between 2015 and 2010")
# [figure: Rplot44]

woneuy01 commented 4 years ago

Code: Tile plot of measles rate by year and state

import data and inspect

# Attach the packages and load the disease data set.
library(tidyverse)
library(dslabs)
data(us_contagious_diseases)

# Print the structure of the data frame to inspect its columns.
str(us_contagious_diseases)

Assign to dat the measles rate per 10,000 people, removing Alaska and Hawaii and adjusting for the number of weeks reporting

the_disease <- "Measles"

# Rate per 10,000 people, scaled by 52 / weeks_reporting so that years with
# fewer reporting weeks are put on a full-year basis. The multiplication
# operators are required: without them the expression does not parse.
# Alaska and Hawaii are excluded, and the state factor is reordered by rate.
dat <- us_contagious_diseases %>%
  filter(!state %in% c("Hawaii", "Alaska") & disease == the_disease) %>%
  mutate(rate = count / population * 10000 * 52 / weeks_reporting) %>%
  mutate(state = reorder(state, rate))

plot disease rates per year in California

# Line plot of the yearly rate for California only, skipping rows whose rate
# is NA, with a blue vertical reference line at 1963.
dat %>%
  filter(state == "California", !is.na(rate)) %>%
  ggplot(aes(x = year, y = rate)) +
  geom_line() +
  geom_vline(xintercept = 1963, col = "blue") +
  ylab("Cases per 10,000")

Rplot2

tile plot of disease rate by state and year

# Tile plot: one tile per state and year, filled by rate on a
# square-root-transformed "Reds" gradient, titled with the disease name.
dat %>%
  ggplot(aes(x = year, y = state, fill = rate)) +
  geom_tile(color = "grey50") +
  geom_vline(xintercept = 1963, col = "blue") +
  # No padding on the x-axis so the tiles reach the panel edges.
  scale_x_continuous(expand = c(0, 0)) +
  scale_fill_gradientn(
    colors = RColorBrewer::brewer.pal(9, "Reds"),
    trans = "sqrt"
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  labs(title = the_disease, x = "", y = "")
# [figure: Rplot3]

woneuy01 commented 4 years ago

Code: Line plot of measles rate by year and state

compute US average measles rate by year

# Yearly US-wide rate per 10,000 for the selected disease: total count over
# total population for each year, ignoring missing values in both sums.
avg <- us_contagious_diseases %>%
  filter(disease == the_disease) %>%
  group_by(year) %>%
  summarize(
    us_rate = sum(count, na.rm = TRUE) / sum(population, na.rm = TRUE) * 10000
  )

make line plot of measles rate by year by state

# One faint grey line per state plus the US average (from `avg`) in black,
# on a square-root y-axis, with a text label and a blue line at 1963.
dat %>%
  filter(!is.na(rate)) %>%
  ggplot() +
  geom_line(
    mapping = aes(x = year, y = rate, group = state),
    color = "grey50",
    alpha = 0.2,
    size = 1,
    show.legend = FALSE
  ) +
  geom_line(
    data = avg,
    mapping = aes(x = year, y = us_rate),
    size = 1,
    col = "black"
  ) +
  # The label position is given through a one-row data frame.
  geom_text(
    data = data.frame(x = 1955, y = 50),
    mapping = aes(x = x, y = y, label = "US average"),
    color = "black"
  ) +
  geom_vline(xintercept = 1963, col = "blue") +
  scale_y_continuous(trans = "sqrt", breaks = c(5, 25, 125, 300)) +
  labs(title = "Cases per 10,000 by state", x = "", y = "")
# [figure: Rplot44]

woneuy01 commented 4 years ago

library(dplyr)
library(ggplot2)
library(dslabs)

# 1967 measles rows with a known population. The rate is per 10,000 people,
# scaled by 52 / weeks_reporting; the multiplication operators are required
# for the expression to parse, and it matches the vector formula for `rate`
# further down.
dat <- us_contagious_diseases %>%
  filter(year == 1967 & disease == "Measles" & !is.na(population)) %>%
  mutate(rate = count / population * 10000 * 52 / weeks_reporting) %>%
  mutate(state = reorder(state, rate))

# Stand-alone copies of the state factor and the rate vector.
state <- dat$state
rate <- dat$count / (dat$population / 10000) * (52 / dat$weeks_reporting)

# Show the state levels after reordering by rate.
print(levels(dat$state))

[1] "Georgia" "District Of Columbia" "Connecticut"
[4] "Minnesota" "Louisiana" "New Hampshire"
[7] "Maryland" "Kansas" "New York"
[10] "Pennsylvania" "Rhode Island" "Massachusetts"
[13] "Missouri" "New Jersey" "South Dakota"
[16] "Vermont" "Delaware" "Ohio"
[19] "Illinois" "Michigan" "Indiana"
[22] "North Carolina" "South Carolina" "Hawaii"
[25] "Maine" "California" "Florida"
[28] "Iowa" "Mississippi" "Oklahoma"
[31] "Nebraska" "Utah" "Alabama"
[34] "Kentucky" "Wisconsin" "Montana"
[37] "Virginia" "Alaska" "Tennessee"
[40] "Idaho" "New Mexico" "Arizona"
[43] "Nevada" "Arkansas" "Wyoming"
[46] "Colorado" "West Virginia" "Oregon"
[49] "Texas" "North Dakota" "Washington"

woneuy01 commented 4 years ago

library(dplyr)
library(ggplot2)
library(dslabs)
data(us_contagious_diseases)

# 1967 measles rows with at least one case and a known population. The rate
# is per 10,000 people, scaled by 52 / weeks_reporting; the multiplication
# operators are required for the expression to parse. The state factor is
# reordered by rate.
dat <- us_contagious_diseases %>%
  filter(
    year == 1967 & disease == "Measles" & count > 0 & !is.na(population)
  ) %>%
  mutate(rate = count / population * 10000 * 52 / weeks_reporting) %>%
  mutate(state = reorder(state, rate))

# Bars of rate per state, with the bar height taken directly from `rate`
# (stat = "identity") and the axes flipped so the states run down the y-axis.
ggplot(data = dat, mapping = aes(x = state, y = rate)) +
  geom_bar(stat = "identity") +
  coord_flip()
# [figure: Rplotdf]

woneuy01 commented 4 years ago

library(dplyr)
library(ggplot2)
library(dslabs)
data("murders")

# Murder rate per 100,000 by region: regions are reordered by their median
# rate, then drawn as box plots (coef = 3) with the raw points on top.
murders %>%
  mutate(
    rate = total / population * 100000,
    region = reorder(region, rate, FUN = median)
  ) %>%
  ggplot(aes(x = region, y = rate)) +
  geom_boxplot(coef = 3) +
  geom_point()
# [figure: dfasd]

woneuy01 commented 4 years ago

library(dplyr) library(ggplot2) library(RColorBrewer) library(dslabs) data(us_contagious_diseases)

the_disease <- "Smallpox"

# Rows for the selected disease, excluding Hawaii and Alaska and keeping only
# rows with at least 10 reporting weeks. The rate is cases per 10,000 people,
# and the state factor is reordered by rate.
dat <- us_contagious_diseases %>%
  filter(
    !state %in% c("Hawaii", "Alaska"),
    disease == the_disease,
    weeks_reporting >= 10
  ) %>%
  mutate(
    rate = count / population * 10000,
    state = reorder(state, rate)
  )

# Tile plot: one tile per state and year, filled by rate on a
# square-root-transformed "Reds" gradient, titled with the disease name.
dat %>%
  ggplot(aes(x = year, y = state, fill = rate)) +
  geom_tile(color = "grey50") +
  # No padding on the x-axis so the tiles reach the panel edges.
  scale_x_continuous(expand = c(0, 0)) +
  scale_fill_gradientn(
    colors = brewer.pal(9, "Reds"),
    trans = "sqrt"
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  labs(title = the_disease, x = "", y = "")
# [figure: er]

woneuy01 commented 4 years ago

library(dplyr) library(ggplot2) library(dslabs) library(RColorBrewer) data(us_contagious_diseases)

the_disease <- "Smallpox"

# Per-state, per-year rate of the selected disease per 10,000 people.
# Hawaii and Alaska are dropped, as are rows with fewer than 10 reporting
# weeks; the state factor is reordered by rate.
dat <- us_contagious_diseases %>%
  filter(
    !state %in% c("Hawaii", "Alaska"),
    disease == the_disease,
    weeks_reporting >= 10
  ) %>%
  mutate(
    rate = count / population * 10000,
    state = reorder(state, rate)
  )

# Yearly US-wide rate per 10,000 for the selected disease: total count over
# total population for each year, ignoring missing values in both sums.
avg <- us_contagious_diseases %>%
  filter(disease == the_disease) %>%
  group_by(year) %>%
  summarize(
    us_rate = sum(count, na.rm = TRUE) / sum(population, na.rm = TRUE) * 10000
  )

A tibble: 25 x 2 year us_rate

1 1928 3.05 2 1929 3.16 3 1930 3.72 4 1931 2.32 5 1932 0.859 6 1933 0.489 7 1934 0.405 8 1935 0.587 9 1936 0.566 10 1937 0.860 # … with 15 more rows

# One faint grey line per state plus the US average (from `avg`) in black,
# on a square-root y-axis, with a text label and a blue line at 1963.
dat %>%
  ggplot() +
  geom_line(
    mapping = aes(x = year, y = rate, group = state),
    color = "grey50",
    alpha = 0.2,
    size = 1,
    show.legend = FALSE
  ) +
  geom_line(
    data = avg,
    mapping = aes(x = year, y = us_rate),
    size = 1,
    color = "black"
  ) +
  # The label position is given through a one-row data frame.
  geom_text(
    data = data.frame(x = 1955, y = 50),
    mapping = aes(x = x, y = y, label = "US average"),
    color = "black"
  ) +
  geom_vline(xintercept = 1963, col = "blue") +
  scale_y_continuous(trans = "sqrt", breaks = c(5, 25, 125, 300)) +
  labs(title = "Cases per 10,000 by state", x = "", y = "")
# [figure: Rplot1]

woneuy01 commented 4 years ago

library(dplyr) library(ggplot2) library(dslabs) library(RColorBrewer) data(us_contagious_diseases)

# California rows with at least 10 reporting weeks: rate per 10,000 for each
# year and disease, drawn as one coloured line per disease.
us_contagious_diseases %>%
  filter(state == "California", weeks_reporting >= 10) %>%
  group_by(year, disease) %>%
  summarize(rate = sum(count) / sum(population) * 10000) %>%
  ggplot(aes(x = year, y = rate, color = disease)) +
  geom_line()
# [figure: Rplot2]

woneuy01 commented 4 years ago

library(dplyr) library(ggplot2) library(dslabs) library(RColorBrewer) data(us_contagious_diseases) library(dplyr) library(ggplot2) library(dslabs) library(RColorBrewer) data(us_contagious_diseases)

# All rows with a known population: rate per 10,000 for each year and
# disease, drawn as one coloured line per disease.
us_contagious_diseases %>%
  filter(!is.na(population)) %>%
  group_by(year, disease) %>%
  summarize(rate = sum(count) / sum(population) * 10000) %>%
  ggplot(aes(x = year, y = rate, color = disease)) +
  geom_line()
# [figure: Rplot3]

woneuy01 commented 4 years ago

Libraries, Options, and Data: define the titanic dataset, starting from the titanic library, with the following code:

options(digits = 3) # report 3 significant digits
library(tidyverse)
library(titanic)

# Working copy of titanic_train: keep seven columns and convert Survived,
# Pclass and Sex to factors.
titanic <- titanic_train %>%
  select(Survived, Pclass, Sex, Age, SibSp, Parch, Fare) %>%
  mutate(
    Survived = factor(Survived),
    Pclass = factor(Pclass),
    Sex = factor(Sex)
  )

woneuy01 commented 4 years ago

# Install only the packages that are not already available, instead of
# re-installing all three every time the script runs.
required_pkgs <- c("dplyr", "dslabs", "ggplot2")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# library() stops with an error when a package cannot be loaded, whereas
# require() only returns FALSE and lets the script fail later on.
library(dplyr)
library(dslabs)
library(ggplot2)

# Load the three data sets used by the plots that follow.
data(temp_carbon)
data(greenhouse_gases)
data(historic_co2)

# Concentration by year, one facet row per gas with free scales, and a
# vertical reference line at 1850.
greenhouse_gases %>%
  ggplot(aes(x = year, y = concentration)) +
  geom_line() +
  geom_vline(xintercept = 1850) +
  facet_grid(rows = vars(gas), scales = "free") +
  labs(
    title = "Atmospheric greenhouse gas concentration by year, 0-2000",
    y = "Concentration (ch4/n2o ppb, co2 ppm)"
  )

1

# Line plot of carbon_emissions against year from temp_carbon.
ggplot(data = temp_carbon, mapping = aes(x = year, y = carbon_emissions)) +
  geom_line()

# Line plot of co2 against year, one coloured line per source.
co2_time <- historic_co2 %>%
  ggplot(aes(year, co2, color = source)) +
  geom_line()

# Restrict the x-axis to the range -3000 to 2018. The argument is spelled out
# as `limits` rather than relying on partial matching of `limit`.
co2_time + scale_x_continuous(limits = c(-3000, 2018))