Open woneuy01 opened 4 years ago
# Small example vector with repeated values.
x <- c(3, 3, 3, 3, 4, 4, 2)

# How many distinct values does x contain?
length(unique(x))

# Frequency table: how many times each distinct value occurs.
tab <- table(x)
library(dslabs)
data(heights)

# Frequency table of the reported heights.
tab <- table(heights$height)

# Number of height values that were reported exactly once.
sum(tab == 1)
# Proportion of each sex in the data set. The object is `heights`
# (loaded via data(heights)), not `height`.
prop.table(table(heights$sex))
# Output reported by the author:
# Female   Male
#  0.227  0.773
library(tidyverse)
library(dslabs)
data(heights)

# Logical mask selecting the male rows, then the corresponding heights.
index <- heights$sex == "Male"
x <- heights$height[index]
# Mean and (population) standard deviation of x computed by hand.
average <- sum(x) / length(x)

# Each deviation must be squared *before* summing, and the division by
# length(x) belongs inside the square root; otherwise the deviations
# cancel out and the result is approximately zero.
SD <- sqrt(sum((x - average)^2) / length(x))
# Same summary using the built-in functions. Note that sd() divides by
# length(x) - 1 rather than length(x).
average <- mean(x)
SD <- sd(x)

# Show both values as a named vector.
c(average = average, SD = SD)
# Convert x to standard units (centered and divided by its standard deviation).
z <- scale(x, center = TRUE, scale = TRUE)

# Proportion of observations lying within 2 standard units of the mean.
sum(abs(z) < 2) / length(z)
Note about the sd function: The built-in R function sd calculates the standard deviation, but it divides by length(x)-1 instead of length(x). That is, it computes the sample standard deviation, treating x as a sample rather than as the whole population. When the length of the vector is large, this difference is negligible and you can use the built-in sd function. Otherwise, you should compute σ by hand.
library(dslabs)
data(heights)
x <- heights$height[heights$sex == "Male"]

m <- mean(x)

# Standard units of 69 and 72 using the population standard deviation.
# The denominator must be length(x), the number of observations;
# length(m) is always 1 because m is a single number.
a1 <- (69 - m) / sqrt(sum((x - m)^2) / length(x))
a2 <- (72 - m) / sqrt(sum((x - m)^2) / length(x))

# Observed proportion of heights in the interval (69, 72].
mean(x > 69 & x <= 72)
Given a normal distribution with a mean mu and standard deviation sigma, you can calculate the proportion of observations less than or equal to a certain value with pnorm(value, mu, sigma). Notice that this is the CDF for the normal distribution.
library(dslabs)
data(heights)
x <- heights$height[heights$sex == "Male"]

avg <- mean(x)
stdev <- sd(x)

# Normal CDF evaluated at the two interval endpoints.
a1 <- pnorm(69, avg, stdev)
a2 <- pnorm(72, avg, stdev)

# Approximate proportion of heights between 69 and 72 under normality.
a2 - a1
# With the normal approximation: proportion of heights less than or equal
# to 72, using the avg and stdev computed above.
pnorm(72, avg, stdev)
# When the data are not well approximated by a normal distribution
library(dslabs)
data(heights)
x <- heights$height[heights$sex == "Male"]

# Exact proportion of observations in (79, 81], computed from the data.
exact <- mean(x > 79 & x <= 81)

# The same proportion estimated with the normal approximation.
mu <- mean(x)
stdev <- sd(x)
approx <- pnorm(81, mu, stdev) - pnorm(79, mu, stdev)

# Ratio of the exact proportion to the approximation. Author's note: because
# the data are far from normal in this range, the ratio is larger than 1.
exact / approx
# Probability of a height above 7 feet (7 * 12 inches) under a normal
# distribution with mean 69 and standard deviation 3. The `*` operators had
# been stripped from the pasted text ("712", "p1000000000").
p <- 1 - pnorm(7 * 12, 69, 3)

# Expected number of such people in a population of one billion.
round(p * 1000000000)
Quantile-quantile plots, or QQ-plots, are used to check whether distributions are well-approximated by a normal distribution.
library(tidyverse)
library(dslabs)
data(heights)

# Male heights and their values in standard units.
index <- heights$sex == "Male"
x <- heights$height[index]
z <- scale(x)

# Proportion of male heights at or below 69.5.
mean(x <= 69.5)
# Probabilities 0.05, 0.10, ..., 0.95 at which quantiles are compared.
p <- seq(0.05, 0.95, 0.05)

# Quantiles observed in the data.
observed_quantiles <- quantile(x, p)

# Quantiles of a normal distribution with the same mean and sd as x.
theoretical_quantiles <- qnorm(p, mean = mean(x), sd = sd(x))
# QQ-plot: observed against theoretical quantiles.
plot(theoretical_quantiles, observed_quantiles)

# Identity line (intercept 0, slope 1); points near it indicate agreement
# between observed and theoretical quantiles.
abline(0, 1)
# Same QQ-plot in standard units: with z-scores, the theoretical quantiles
# come from the standard normal, so qnorm() needs no mean/sd arguments.
observed_quantiles <- quantile(z, p)
theoretical_quantiles <- qnorm(p)

plot(theoretical_quantiles, observed_quantiles)
abline(0, 1)
library(dslabs)
data(heights)

# Percentiles of height: the quantiles at 0.01, 0.02, ..., 0.99.
quantile(heights$height, seq(0.01, 0.99, 0.01))
library(dslabs)
data(heights)

# Heights split by sex.
male <- heights$height[heights$sex == "Male"]
female <- heights$height[heights$sex == "Female"]

# 10th, 30th, 50th, 70th and 90th percentiles for each group.
female_percentiles <- quantile(female, seq(0.1, 0.9, 0.2))
male_percentiles <- quantile(male, seq(0.1, 0.9, 0.2))

# Side-by-side comparison, one row per percentile.
df <- data.frame(female = female_percentiles, male = male_percentiles)
print(df)
Median absolute deviation (MAD)
# NOTE(review): `Galton` is not loaded anywhere in the visible notes; it is
# presumably the Galton data set from the HistData package and must be made
# available (e.g. library(HistData); data(Galton)) before this runs - confirm.
x <- Galton$child

# Return the mean of x after its first entry is replaced by k.
# The assignment inside the function modifies a local copy only, so the
# global x is left unchanged between calls.
error_avg <- function(k) {
  x[1] <- k
  mean(x)
}

# Effect of a single extreme value, in either direction, on the average.
error_avg(10000)
error_avg(-10000)
library(dslabs)
data(heights)

# Inspect the data set: list the column names of heights.
names(heights)