# Issue status: Open (opened by daramireh 2 years ago)
# Setup: make sure the add-on packages are available, then attach the ones
# whose functions are called unqualified in this script.
#
# Each statement sits on its own line (several statements on one line without
# `;` do not parse), and packages are installed only when missing instead of
# being reinstalled on every run.
# - hexbin is required by geom_hex() but does not need to be attached.
# - nycflights13 is only accessed through `nycflights13::flights`.
for (pkg in c("ggstance", "lvplot", "hexbin", "nycflights13")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(tidyverse)
library(ggstance)
library(lvplot)
library(modelr)
# Variation: look at the distribution of one variable at a time.

# Categorical variable: bar chart of `cut`, then the same tallies as a table.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()
count(diamonds, cut)

# Continuous variable: histogram of `carat` in bins of width 0.5, then the
# per-bin counts computed by hand with cut_width().
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)
count(diamonds, cut_width(carat, 0.5))

# Keep only diamonds under 3 carats and redraw with a finer bin width.
smaller <- filter(diamonds, carat < 3)
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# One frequency polygon per cut, overlaid in a single plot.
ggplot(smaller, aes(x = carat, color = cut)) +
  geom_freqpoly(binwidth = 0.1)

# Histogram of `eruptions` from the `faithful` dataset.
ggplot(faithful, aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25)
# Histogram of `y`, zoomed to counts between 0 and 50 so that sparsely
# populated bins become visible.
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Rows whose `y` is below 3 or above 20, sorted by `y`.
unusual <- arrange(filter(diamonds, y < 3 | y > 20), y)
unusual

# Same data, but with those out-of-range `y` values replaced by NA.
diamonds2 <- mutate(diamonds, y = ifelse(y < 3 | y > 20, NA, y))

# Scatterplot of x against y; the second call passes na.rm = TRUE to
# geom_point() for the rows whose `y` is now NA.
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point()
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)
# Frequency polygons of scheduled departure time, one line per value of
# `cancelled` (TRUE when dep_time is NA). sched_dep_time is split into its
# hundreds part (sched_hour) and remainder (sched_min), then recombined as a
# decimal number of hours; bins are a quarter of an hour wide.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = sched_dep_time)) +
  geom_freqpoly(aes(color = cancelled), binwidth = 1 / 4)
# Price distribution per cut, drawn as counts.
ggplot(diamonds, aes(x = price)) +
  geom_freqpoly(aes(color = cut), binwidth = 500)

# Number of diamonds per cut.
ggplot(diamonds) +
  geom_bar(aes(x = cut))

# Same frequency polygons, but with density on the y axis instead of count.
# after_stat(density) replaces the `..density..` notation, which is deprecated
# since ggplot2 3.4.0.
ggplot(diamonds, aes(x = price, y = after_stat(density))) +
  geom_freqpoly(aes(color = cut), binwidth = 500)
# Boxplot of price for each cut.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot()

# Boxplot of highway mileage for each car class.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# Same plot with the classes ordered by their median hwy.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot()

# Ordered version again, with the axes flipped.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip()

# Unordered boxplot once more.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()
# Letter-value plots (lvplot::geom_lv) and violin plots.
# Each statement is on its own line: several of them were fused onto a single
# line, which R cannot parse. `..LV..` is written as after_stat(LV), the
# current ggplot2 notation for a computed variable.

ggplot(diamonds, aes(x = price, y = cut)) +
  geom_lv(aes(color = cut))

# hwy by class, in three variants built on the same base plot.
p <- ggplot(mpg, aes(class, hwy))
p + geom_lv(aes(fill = after_stat(LV))) + scale_fill_brewer()
p + geom_lv() + geom_jitter(width = 0.2)
p + geom_lv(alpha = 1, aes(fill = after_stat(LV))) + scale_fill_lv()

# price by cut: letter-value plot and violin plot on the same base plot.
d <- ggplot(diamonds, aes(cut, price))
d + geom_lv()
d + geom_violin()

# Violin plot of hwy with the classes ordered by their median hwy.
ggplot(mpg) +
  geom_violin(aes(x = reorder(class, hwy, FUN = median), y = hwy))
# Covariation of two categorical variables.

# Count the observations for each cut/color combination.
ggplot(diamonds, aes(x = cut, y = color)) +
  geom_count()

# Tabulate color/cut pairs, then draw a tile per pair filled by its count `n`.
diamonds %>%
  count(color, cut) %>%
  ggplot(aes(x = color, y = cut)) +
  geom_tile(aes(fill = n))
# Covariation of two continuous variables: price against carat.

# Plain scatterplot, then the same with nearly transparent points.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100)

# 2D binning on the `smaller` subset: rectangular bins, then hexagonal bins.
ggplot(smaller, aes(x = carat, y = price)) +
  geom_bin2d()
ggplot(smaller, aes(x = carat, y = price)) +
  geom_hex()

# Bin carat and draw one boxplot per bin: first with bins of width 0.1,
# then with 20 groups produced by cut_number().
ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_width(carat, 0.1)))
ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_number(carat, 20)))
# Scatterplot of waiting against eruptions from the `faithful` dataset.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point()

# Fit log(price) as a linear function of log(carat), attach the model's
# residuals to the data, and exponentiate them.
# Note: this reassigns `diamonds2`.
mod <- lm(log(price) ~ log(carat), data = diamonds)
diamonds2 <- mutate(add_residuals(diamonds, mod), resid = exp(resid))

# Exponentiated residuals against carat, then per cut.
ggplot(diamonds2, aes(x = carat, y = resid)) +
  geom_point()
ggplot(diamonds2, aes(x = cut, y = resid)) +
  geom_boxplot()
# The same frequency polygon written twice: first with every argument named,
# then with the leading arguments passed by position.
ggplot(
  data = faithful,
  mapping = aes(x = eruptions)
) +
  geom_freqpoly(binwidth = 0.25)

ggplot(faithful, aes(eruptions)) +
  geom_freqpoly(binwidth = 0.25)

# Counts of each cut/clarity pair piped straight into a tile plot.
diamonds %>%
  count(cut, clarity) %>%
  ggplot(aes(clarity, cut, fill = n)) +
  geom_tile()
# Chapter 5: Exploratory Data Analysis
# Setup: make sure the add-on packages are available, then attach the ones
# whose functions are called unqualified in this script.
#
# Each statement sits on its own line (several statements on one line without
# `;` do not parse), and packages are installed only when missing instead of
# being reinstalled on every run.
# - hexbin is required by geom_hex() but does not need to be attached.
# - nycflights13 is only accessed through `nycflights13::flights`.
for (pkg in c("ggstance", "lvplot", "hexbin", "nycflights13")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(tidyverse)
library(ggstance)
library(lvplot)
library(modelr)
# Examine a categorical variable
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))

# Count a categorical variable
diamonds %>%
  count(cut)

# Examine a continuous variable
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.5)

# Compute the bin counts of a continuous variable by hand
diamonds %>%
  count(cut_width(carat, 0.5))

# Try a different bin width, on diamonds under 3 carats only
smaller <- diamonds %>%
  filter(carat < 3)

ggplot(data = smaller, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# Overlay multiple distributions in one plot (one frequency polygon per cut)
ggplot(data = smaller, mapping = aes(x = carat, color = cut)) +
  geom_freqpoly(binwidth = 0.1)

# Example of clustering
ggplot(data = faithful, mapping = aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25)
# Zoom in on the y axis
ggplot(diamonds) +
  geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Using what the previous plot shows, pull out the rows with unusual values
# (at 0, ~30 and ~60)
unusual <- diamonds %>%
  filter(y < 3 | y > 20) %>%
  arrange(y)
unusual

# Replace the unusual values with NA
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

# Scatterplot of x against y; the second call passes na.rm = TRUE to
# geom_point() for the rows whose `y` is now NA.
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point()
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

# Scheduled departure time (as decimal hours) by whether dep_time is missing.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(mapping = aes(sched_dep_time)) +
  geom_freqpoly(mapping = aes(color = cancelled), binwidth = 1 / 4)
# The default appearance of geom_freqpoly() is not that useful for that sort
# of comparison because the height is given by the count. That means if one
# of the groups is much smaller than the others, it's hard to see the
# differences in shape. For example, let's explore how the price of a diamond
# varies with its quality:
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

# Number of diamonds per cut
ggplot(diamonds) +
  geom_bar(mapping = aes(x = cut))

# Same comparison with density instead of count on the y axis.
# after_stat(density) replaces the `..density..` notation, which is deprecated
# since ggplot2 3.4.0.
ggplot(
  data = diamonds,
  mapping = aes(x = price, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
# Boxplots of a continuous variable broken down by a categorical one
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()

# Reordering: sort the classes by their median hwy
ggplot(data = mpg) +
  geom_boxplot(
    mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)
  )

# Turn the plot: flip the axes
ggplot(data = mpg) +
  geom_boxplot(
    mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)
  ) +
  coord_flip()

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = class, y = hwy))
# Exercises: letter-value plots (lvplot::geom_lv) and violin plots.
# Each statement is on its own line: several of them were fused onto a single
# line, which R cannot parse. `..LV..` is written as after_stat(LV), the
# current ggplot2 notation for a computed variable.

ggplot(data = diamonds, aes(x = price, y = cut)) +
  geom_lv(mapping = aes(color = cut))

# hwy by class, in three variants built on the same base plot
p <- ggplot(mpg, aes(class, hwy))
p + geom_lv(aes(fill = after_stat(LV))) + scale_fill_brewer()
p + geom_lv() + geom_jitter(width = 0.2)
p + geom_lv(alpha = 1, aes(fill = after_stat(LV))) + scale_fill_lv()

# price by cut: letter-value plot and violin plot on the same base plot
d <- ggplot(diamonds, aes(cut, price))
d + geom_lv()
d + geom_violin()

# Violin plot of hwy with the classes ordered by their median hwy
ggplot(data = mpg) +
  geom_violin(
    mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)
  )
# Two categorical variables

# Count the observations for each cut/color combination
ggplot(data = diamonds) +
  geom_count(mapping = aes(x = cut, y = color))

# Tabulate color/cut pairs, then draw one tile per pair filled by its count
diamonds %>%
  count(color, cut) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_tile(mapping = aes(fill = n))
# Two continuous variables
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))

# Same scatterplot with nearly transparent points
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price), alpha = 1 / 100)

# Using bin2d
ggplot(data = smaller) +
  geom_bin2d(mapping = aes(x = carat, y = price))

# Using the hexbin package
ggplot(data = smaller) +
  geom_hex(mapping = aes(x = carat, y = price))

# Using a continuous variable like a categorical variable:
# cut_width(x, width), as used here, divides x into bins of width `width`
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))

# Alternatively, bin by number of points: cut_number(carat, 20) makes 20
# groups with approximately equal numbers of observations
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_number(carat, 20)))
# Patterns and Models
ggplot(data = faithful) +
  geom_point(mapping = aes(x = eruptions, y = waiting))

# The following code fits a model that predicts price from carat and then
# computes the residuals. The residuals give us a view of the price of the
# diamond, once the effect of carat has been removed:
mod <- lm(log(price) ~ log(carat), data = diamonds)

# Note: this reassigns `diamonds2`.
diamonds2 <- diamonds %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))

ggplot(data = diamonds2) +
  geom_point(mapping = aes(x = carat, y = resid))

# Once you've removed the strong relationship between carat and price, you
# can see what you expect in the relationship between cut and price:
# relative to their size, better quality diamonds are more expensive:
ggplot(data = diamonds2) +
  geom_boxplot(mapping = aes(x = cut, y = resid))
# Other ggplot2 calls
ggplot(data = faithful, mapping = aes(x = eruptions)) +
  geom_freqpoly(binwidth = 0.25)

# Rewriting the previous plot more concisely yields:
ggplot(faithful, aes(eruptions)) +
  geom_freqpoly(binwidth = 0.25)

# Counts of each cut/clarity pair piped straight into a tile plot
diamonds %>%
  count(cut, clarity) %>%
  ggplot(aes(clarity, cut, fill = n)) +
  geom_tile()