# Chapter 5: Exploratory data analysis

# Package setup ----
# Install the add-on packages only when they are missing, so that re-running
# the script does not reinstall them every time.
for (pkg in c("ggstance", "lvplot", "hexbin")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# One statement per line: R cannot parse several calls placed on one line
# without a separator.
library(tidyverse)
library(ggstance)
library(lvplot)
library(modelr)
# hexbin is installed above but not attached, exactly as before.
# NOTE(review): tidyverse, modelr and nycflights13 (used later through `::`)
# are assumed to be installed already.

# Examine a categorical variable

# Bar chart with one bar per level of `cut`.
diamonds %>%
  ggplot(aes(x = cut)) +
  geom_bar()

# Count the values of a categorical variable

# Tabulate the number of rows for each level of `cut`.
count(diamonds, cut)

# Examine a continuous variable

# Histogram of `carat` with bins 0.5 wide.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

# Count a continuous variable by bin

# cut_width() groups `carat` into intervals 0.5 wide; count() tallies the
# rows that fall in each interval.
count(diamonds, cut_width(carat, 0.5))

# Testing different binwidths

# Keep only the diamonds below 3 carats; `smaller` is reused by later plots.
smaller <- filter(diamonds, carat < 3)

# Histogram of `carat` on the reduced data with a narrower binwidth of 0.1.
ggplot(smaller, aes(carat)) +
  geom_histogram(binwidth = 0.1)

# Overlay multiple histograms on one plot

# One frequency polygon of `carat` per level of `cut`, drawn in one panel.
ggplot(smaller) +
  geom_freqpoly(aes(x = carat, color = cut), binwidth = 0.1)

# Example of clustering

# Histogram of eruption lengths from the `faithful` data set.
faithful %>%
  ggplot(aes(eruptions)) +
  geom_histogram(binwidth = 0.25)

# Zoom in on the y axis

# Histogram of the `y` dimension; coord_cartesian() limits the visible count
# axis to 0-50 so that rarely occupied bins become visible.
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Using the information from the previous plot,
# select the unusual values at 0, ~30 and ~60.

# Rows whose `y` is below 3 or above 20, sorted by `y`.
unusual <- diamonds %>%
  filter(y < 3 | y > 20) %>%
  arrange(y)

print(unusual)

# Replace the unusual values with NA

# Copy of `diamonds` in which `y` values below 3 or above 20 become NA.
diamonds2 <- mutate(
  diamonds,
  y = ifelse(y < 3 | y > 20, NA, y)
)

# Default behaviour: geom_point() removes the missing values with a warning.
ggplot(diamonds2, aes(x, y)) +
  geom_point()

# With na.rm = TRUE the missing values are removed silently.
ggplot(diamonds2, aes(x, y)) +
  geom_point(na.rm = TRUE)

# Frequency polygons of scheduled departure time, one line per value of
# `cancelled` (TRUE when `dep_time` is missing).
# `sched_dep_time` is split into its hundreds part and the remainder, then
# recombined as hour + minute / 60 (presumably an HHMM-encoded time).
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = sched_dep_time, color = cancelled)) +
  geom_freqpoly(binwidth = 0.25)

# The default appearance of geom_freqpoly() is not that useful for that sort
# of comparison, because the height is given by the count. That means if one
# of the groups is much smaller than the others, it’s hard to see the
# differences in shape. For example, let’s explore how the price of a diamond
# varies with its quality:

# Count-based frequency polygons of `price`, one line per `cut`.
ggplot(diamonds, aes(x = price, color = cut)) +
  geom_freqpoly(binwidth = 500)

# Bar chart of `cut`, showing how many diamonds each group contains.
diamonds %>%
  ggplot() +
  geom_bar(aes(x = cut))

# Frequency polygons of `price` by `cut` with density on the y axis instead
# of the raw count, so groups of different sizes can be compared by shape.
# after_stat(density) replaces the `..density..` notation, which is
# deprecated as of ggplot2 3.4.0.
ggplot(
  data = diamonds,
  mapping = aes(x = price, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

# Boxplot of `price` for each `cut`.
ggplot(diamonds, aes(cut, price)) +
  geom_boxplot()

# Boxplot of highway mileage (`hwy`) for each car `class`.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot()

# Reordering the categories

# Same boxplot, with `class` ordered by the median of `hwy`.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot()

# Flip the plot by 90 degrees

# Reordered boxplot with the x and y axes swapped by coord_flip().
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip()

# Plain boxplot of `hwy` by `class`, without reordering.
mpg %>%
  ggplot(aes(class, hwy)) +
  geom_boxplot()

# Exercises

# Letter-value plot of the diamonds data, coloured by `cut`.
# NOTE(review): `price` is mapped to x and `cut` to y here; confirm that
# geom_lv() handles this orientation as intended.
ggplot(diamonds, aes(x = price, y = cut, color = cut)) +
  geom_lv()

# Base plot shared by the three letter-value variations below.
# Each statement sits on its own line; placed on one line without separators
# they do not parse.
p <- ggplot(mpg, aes(class, hwy))

# Fill mapped to the computed `LV` variable, using a brewer palette.
# after_stat(LV) replaces the deprecated `..LV..` notation.
p + geom_lv(aes(fill = after_stat(LV))) + scale_fill_brewer()

# Letter-value plot with the jittered raw points drawn on top.
p + geom_lv() + geom_jitter(width = 0.2)

# Fill mapped to `LV`, using lvplot's own fill scale.
p + geom_lv(aes(fill = after_stat(LV)), alpha = 1) + scale_fill_lv()

# Base plot of `price` by `cut`, shared by the two plots below.
d <- ggplot(diamonds, aes(cut, price))

# The two plots are separate statements and need separate lines to parse.
d + geom_lv()
d + geom_violin()

# Violin plot of `hwy` by `class`, with classes ordered by median `hwy`.
mpg %>%
  ggplot(aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_violin()

# Two categorical variables

# Point size shows the number of diamonds in each cut/color combination.
ggplot(diamonds, aes(x = cut, y = color)) +
  geom_count()

# The same counts computed with count() and drawn as tiles filled by `n`.
diamonds %>%
  count(color, cut) %>%
  ggplot(aes(x = color, y = cut, fill = n)) +
  geom_tile()

# Two continuous variables

# Scatterplot of `price` against `carat`.
ggplot(diamonds, aes(carat, price)) +
  geom_point()

# Same scatterplot with nearly transparent points (alpha = 1/100).
ggplot(diamonds, aes(carat, price)) +
  geom_point(alpha = 0.01)

# Using geom_bin2d()

# Rectangular 2-D bins of `carat` against `price` on the reduced data.
ggplot(smaller, aes(carat, price)) +
  geom_bin2d()

# Using the hexbin package (geom_hex())

# Hexagonal 2-D bins of `carat` against `price` on the reduced data.
ggplot(smaller, aes(carat, price)) +
  geom_hex()

# Using a continuous variable like a categorical variable:
# cut_width(x, width), as used here, divides x into bins of width `width`.

# One boxplot of `price` per 0.1-wide interval of `carat`.
smaller %>%
  ggplot(aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_width(carat, 0.1)))

# Alternatively, bin so that each bin holds roughly the same number of points.

# One boxplot of `price` per group, with `carat` split into 20 groups by
# cut_number().
smaller %>%
  ggplot(aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_number(carat, 20)))

# Patterns and models

# Scatterplot of waiting time against eruption length.
ggplot(faithful, aes(eruptions, waiting)) +
  geom_point()

# The following code fits a model that predicts price from carat and then
# computes the residuals. The residuals give us a view of the price of the
# diamond once the effect of carat has been removed:

# Linear model of log(price) on log(carat).
mod <- lm(log(price) ~ log(carat), data = diamonds)

# Attach the model residuals and exponentiate them back from the log scale.
# This reassigns `diamonds2`.
diamonds2 <- mutate(
  add_residuals(diamonds, mod),
  resid = exp(resid)
)

# Residuals plotted against `carat`.
ggplot(diamonds2, aes(carat, resid)) +
  geom_point()

# Once you’ve removed the strong relationship between carat and price, you
# can see what you expect in the relationship between cut and price:
# relative to their size, better-quality diamonds are more expensive.

# Boxplot of the residuals for each `cut`.
ggplot(diamonds2, aes(cut, resid)) +
  geom_boxplot()

# Other ggplot2 function calls

# Frequency polygon of eruption lengths.
faithful %>%
  ggplot(aes(x = eruptions)) +
  geom_freqpoly(binwidth = 0.25)

# Rewriting the previous plot more concisely yields:

# Frequency polygon of eruption lengths, with `data` and `x` passed by
# position instead of by name.
ggplot(faithful, aes(eruptions)) +
  geom_freqpoly(binwidth = 0.25)

# Tile plot of the number of diamonds in each cut/clarity combination.
count(diamonds, cut, clarity) %>%
  ggplot(aes(x = clarity, y = cut, fill = n)) +
  geom_tile()

# Source repository: daramireh / rfordatasciencebook

# End of Part I, EDA, Chapter 5 (#3)