Marshhhhh / first


kmeans #13

Open · Marshhhhh opened this issue 7 years ago

Marshhhhh commented 7 years ago

```r
library(ggplot2)

d <- iris[, c("Sepal.Length", "Petal.Width")]

fit <- kmeans(d, 3)
d$clusters <- factor(fit$cluster)

ggplot(d, aes(Sepal.Length, Petal.Width, col = clusters)) +
  geom_point(size = 2) +
  theme_bw()
```

https://www.naftaliharris.com/blog/visualizing-k-means-clustering/
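The snippet leaves two things implicit that the linked visualization makes obvious: kmeans() starts from random centers, so runs differ unless you set a seed, and k = 3 is an input, not something the algorithm discovers. A minimal elbow-check sketch (my addition, assuming the `d` from the block above):

```r
# elbow sketch: total within-cluster sum of squares for k = 1..10
set.seed(42)  # kmeans() picks random starting centers
wss <- sapply(1:10, function(k) kmeans(d, centers = k, nstart = 10)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "k", ylab = "total within-cluster SS")
```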

Marshhhhh commented 7 years ago

```r
library(ggplot2)
library(ggrepel)  # for nicely placed point labels on the plot

x <- rnorm(10)
y <- rnorm(10)
test_data <- data.frame(x, y)
test_data$labels <- 1:10

ggplot(test_data, aes(x, y, label = labels)) +
  geom_point() +
  geom_text_repel()

d <- dist(test_data[, c("x", "y")])  # exclude the labels column from the distances
fit <- hclust(d, method = "single")
plot(fit, labels = test_data$labels)
rect.hclust(fit, 2)  # set the desired number of clusters; it is 2 here
```
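To get the same two clusters as data rather than rectangles on the dendrogram, one could follow up with cutree(); a small sketch (my addition), assuming the `fit` and `test_data` above:

```r
# extract cluster assignments and color the original scatterplot by them
test_data$cluster <- factor(cutree(fit, k = 2))
ggplot(test_data, aes(x, y, label = labels, col = cluster)) +
  geom_point() +
  geom_text_repel()
```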

Marshhhhh commented 7 years ago

```r
library(ape)

set.seed(222)
tr <- rtree(20, tip.label = c("B","C","D","E","F","G","H","I","J",
                              "K","L","M","N","O","P","Q","R","S","T","U"))

# left tree
plot.phylo(tr)

# right tree (topology only, branch lengths ignored)
plot.phylo(tr, use.edge.length = FALSE)
```
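To compare the two renderings directly, one can draw them in the same device; a sketch (my addition), assuming the `tr` above:

```r
# both trees side by side
par(mfrow = c(1, 2))
plot.phylo(tr)                           # left: with branch lengths
plot.phylo(tr, use.edge.length = FALSE)  # right: topology only
par(mfrow = c(1, 1))                     # reset the layout
```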

Marshhhhh commented 7 years ago

```r
smart_hclust <- function(data, nclust){
  dist_matrix <- dist(data)                       # compute the distance matrix
  fit <- hclust(dist_matrix)                      # hierarchical clustering
  data$cluster <- as.factor(cutree(fit, nclust))  # cluster label for each row
  return(data)
}
```
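A quick smoke test on a built-in dataset (my addition, not part of the task):

```r
head(smart_hclust(swiss, 3))  # swiss rows with a new cluster factor appended
```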

Marshhhhh commented 7 years ago

```r
# a small hand-made test set
test_data <- as.data.frame(list(X1 = c(13, 6, 16, 20, 31, 24),
                                X2 = c(7, 14, 20, 18, 27, 27),
                                X3 = c(11, 5, 20, 22, 28, 30),
                                X4 = c(7, 12, 20, 16, 33, 32)))

# the course test set; str() below and the output further down refer to this one
test_data <- read.csv("https://stepic.org/media/attachments/course/524/test_data_hclust.csv")

str(test_data)
#> 'data.frame': 12 obs. of  5 variables:
#>  $ X1: int  11 9 9 9 7 9 16 23 15 19 ...
#>  $ X2: int  7 10 2 11 9 11 20 18 21 20 ...
#>  $ X3: int  10 10 12 8 10 9 22 21 14 15 ...
#>  $ X4: int  10 8 14 10 11 6 19 24 21 17 ...
#>  $ X5: int  8 6 11 3 14 9 16 16 21 17 ...
```

```r
dist_matrix <- dist(swiss)  # compute the distance matrix
fit <- hclust(dist_matrix)  # hierarchical clustering
cluster <- cutree(fit, 3)   # cluster number for each observation
```
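A small sanity check one might add here (my addition): tabulate the cluster sizes.

```r
table(cluster)  # number of observations per cluster
```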


```r
smart_hclust(test_data, 3)  # three clusters extracted
#>    X1 X2 X3 X4 X5 cluster
#> 1  11  7 10 10  8       1
#> 2   9 10 10  8  6       1
#> 3   9  2 12 14 11       1
#> 4   9 11  8 10  3       1
#> 5   7  9 10 11 14       1
#> 6   9 11  9  6  9       1
#> 7  16 20 22 19 16       2
#> 8  23 18 21 24 16       2
#> 9  15 21 14 21 21       3
#> 10 19 20 15 17 17       3
#> 11 20 24 21 20 19       2
#> 12 22 19 27 22 19       2
```

```r
# scratch values for stepping through the function line by line
data <- test_data
nclust <- 2
```

```r
test_data <- read.csv("https://stepic.org/media/attachments/course/524/cluster_1.csv")
```

```r
get_difference <- function(data, nclust){
  dist_matrix <- dist(data)                        # compute the distance matrix
  fit <- hclust(dist_matrix)                       # hierarchical clustering
  data$cluster <- factor(cutree(fit, k = nclust))
  p_values <- apply(data[, colnames(data) != "cluster"], 2, function(x){
    s <- summary(aov(x ~ cluster, data))
    s[[1]]$'Pr(>F)'[[1]]                           # p-value for the cluster factor
  })
  return(names(p_values[p_values < 0.05]))         # variables that differ between clusters
}

get_difference(test_data, 2)
```
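Spelled out for a single variable, this is all get_difference() aggregates; a sketch (my addition), assuming the smart_hclust() helper from the earlier comment:

```r
# the same ANOVA for just the first column
clustered <- smart_hclust(test_data, 2)
summary(aov(clustered[[1]] ~ clustered$cluster))  # compare Pr(>F) with 0.05
```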

```r
test_data <- read.csv("https://stepic.org/media/attachments/course/524/pca_test.csv")
```

```r
test_data
#>   V1 V2 V3 V4 V5
#> 1 13 15 12 13 12
#> 2 16 11  8 12  6
#> 3 15  7 10 12 13
#> 4 12 11  6  6  4
#> 5 11 13 13 10 12
```

```r
get_pca2(test_data)
#>   V1 V2 V3 V4 V5  PC1  PC2
#> 1 13 15 12 13 12 -4.5  2.4
#> 2 16 11  8 12  6  3.0 -1.9
#> 3 15  7 10 12 13 -2.8 -5.1
#> 4 12 11  6  6  4  7.8  1.7
#> 5 11 13 13 10 12 -3.6  3.0
```

```r
data <- test_data  # scratch value for stepping through the functions below
```

```r
get_pc <- function(data){
  comp <- prcomp(data)$x[, 1:2]  # scores on the first two principal components
  return(cbind(data, comp))
}
```

```r
get_pca2 <- function(data){
  comp <- summary(prcomp(data))
  a <- comp$importance[3, ] > 0.9  # which PCs have already passed 90% cumulative variance
  # keep every component up to and including the first one that crosses 90%;
  # drop = FALSE keeps the matrix (and PC names) even when only PC1 is selected
  a <- comp$x[, 1:(length(which(a == FALSE)) + 1), drop = FALSE]
  return(cbind(data, a))
}
```
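Why row 3 of `importance`: summary.prcomp() stores a matrix whose rows are the standard deviation, proportion of variance, and cumulative proportion per component. A sketch for inspecting it (my addition), using the pca_test data above:

```r
imp <- summary(prcomp(test_data))$importance
rownames(imp)  # "Standard deviation" "Proportion of Variance" "Cumulative Proportion"
imp[3, ]       # the cumulative row that get_pca2() thresholds at 0.9
```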

```r
test_data <- read.csv("https://stepic.org/media/attachments/course/524/Norris_1.csv")
test_data
#>   V1 V2 V3 V4
#> 1 22 20 18 20
#> 2 16 28 31 15
#> 3 14 24  7 16

is_multicol(test_data)
#> [1] "There is no collinearity in the data"
```

```r
is_multicol <- function(test_data){
  c <- cor(test_data)                           # pairwise correlation matrix
  diag(c) <- 0                                  # ignore self-correlations
  eq <- function(x, y) isTRUE(all.equal(x, y))  # float-safe test for |r| == 1
  nm <- apply(c, 1, function(x) mapply(eq, abs(x), rep(1, ncol(c))))
  nm <- apply(nm, 2, function(x) TRUE %in% x)   # TRUE if perfectly correlated with any other variable
  nm <- nm[nm == TRUE]
  if (length(nm) == 0){
    res <- "There is no collinearity in the data"
  } else {
    res <- names(nm)
  }
  return(res)
}
```

```r
# why all.equal() rather than ==: floating-point arithmetic is inexact
isTRUE(all.equal(0.1 + 0.05, 0.15))      # TRUE, even though 0.1 + 0.05 == 0.15 is FALSE
all.equal(as.vector(c[, 3]), rep(1, 7))  # scratch check against a correlation matrix c
```

```r
test_data <- read.csv("https://stepic.org/media/attachments/course/524/Norris_2.csv")
test_data
#>   V1 V2 V3 V4
#> 1 13 12  7 11
#> 2 15 14 13 10
#> 3  8  7 11 16

is_multicol(test_data)
#> [1] "V2" "V1"
```

```r
test_data <- as.data.frame(list(V1 = c(16, 12, 2, 5, 16),
                                V2 = c(5, 3, -1, 14, 9),
                                V3 = c(17, 12, 19, 7, 11),
                                V4 = c(18, 8, 12, 30, 1),
                                V5 = c(-18, 2, -6, -3, -7),
                                V6 = c(24, 4, 12, 9, 13),
                                V7 = c(13, 8, 15, 3, 7)))

is_multicol
```

```r
# does V1 lie exactly on a line through V2?
f <- lm(V1 ~ V2, test_data)
sum(f$coefficients[1] + f$coefficients[2] * test_data$V2 == test_data$V1) == nrow(test_data)
```
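The exact `==` comparison above can fail on floats for the same reason all.equal() was needed earlier; a tolerant variant (my suggestion, not from the thread):

```r
isTRUE(all.equal(unname(fitted(f)), test_data$V1))  # TRUE iff V1 is numerically a linear function of V2
```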

```r
library(ggplot2)

data <- swiss  # scratch: the built-in swiss dataset used while testing

smart_hclust <- function(data = swiss, nclust = 2){
  dist_matrix <- dist(data)                       # compute the distance matrix
  fit <- hclust(dist_matrix)                      # hierarchical clustering
  data$cluster <- as.factor(cutree(fit, nclust))  # cluster label for each row
  # return the plot; a density variant could swap the layers for geom_density(alpha = 0.2)
  ggplot(data, aes(x = Education, y = Catholic, col = cluster)) +
    geom_smooth(method = "lm") +
    geom_point()
}
```
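Usage sketch (my addition): since the function returns the ggplot object, calling it draws the clustered scatterplot directly.

```r
smart_hclust(swiss, 3)  # Education vs Catholic, colored by 3 hierarchical clusters
```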