Marshhhhh / first

0 stars 0 forks source link

Scoring #3

Open Marshhhhh opened 8 years ago

Marshhhhh commented 8 years ago

setwd("C:\Users\qwerty\Desktop") tab <- read.table("data_R.txt", sep=";",dec=",", head=TRUE) tab$A1 <- factor(tab$A1)

cor(tab, method = "spearman") cor(tab, method = "kendall")

mylogit <- glm(F~A1+A2+A3+A4+A5+A6+A7+A8+A9+A10+A11+A12+A13+A14+A15+A16+A17+A18+A19+A20+A21+A22, data = tab, family = "binomial") summary(mylogit) confint(mylogit)

install.packages("pROC") library(pROC)

setwd("C:\Users\qwerty\Desktop\S") data_base <- read.table("data.txt", sep=";",dec=",", head=TRUE)

for(i in 1:5) { assign(paste("A1",".",i,sep=""), rep(0, length(data_base$A1)))} A1.1[data_base$A1==1] <- 1 A1.2[data_base$A1==2] <- 2 A1.3[data_base$A1==3] <- 3 A1.4[data_base$A1==4] <- 4

for(i in 1:14) { assign(paste("A2",".",i,sep=""), rep(0, length(data_base$A2)))} A2.1[data_base$A2==1] <- 1 A2.2[data_base$A2==2] <- 2 A2.3[data_base$A2==3] <- 3 A2.4[data_base$A2==4] <- 4 A2.5[data_base$A2==5] <- 5 A2.6[data_base$A2==6] <- 6 A2.7[data_base$A2==7] <- 7 A2.8[data_base$A2==8] <- 8 A2.9[data_base$A2==9] <- 9 A2.10[data_base$A2==10] <- 10 A2.11[data_base$A2==11] <- 11 A2.12[data_base$A2==12] <- 12 A2.13[data_base$A2==13] <- 13 A2.14[data_base$A2==14] <- 14

for(i in 1:18) { assign(paste("A3",".",i,sep=""), rep(0, length(data_base$A3)))} A3.1[data_base$A3==1] <- 1 A3.2[data_base$A3==2] <- 2 A3.3[data_base$A3==3] <- 3 A3.4[data_base$A3==4] <- 4 A3.5[data_base$A3==5] <- 5 A3.6[data_base$A3==6] <- 6 A3.7[data_base$A3==7] <- 7 A3.8[data_base$A3==8] <- 8 A3.9[data_base$A3==9] <- 9 A3.10[data_base$A3==10] <- 10 A3.11[data_base$A3==11] <- 11 A3.12[data_base$A3==12] <- 12 A3.13[data_base$A3==13] <- 13 A3.14[data_base$A3==14] <- 14 A3.15[data_base$A3==15] <- 15 A3.16[data_base$A3==16] <- 16 A3.17[data_base$A3==17] <- 17 A3.18[data_base$A3==18] <- 18

for(i in 1:23) { assign(paste("A6",".",i,sep=""), rep(0, length(data_base$A6)))} A6.1[data_base$A6==1] <- 1 A6.2[data_base$6==2] <- 2 A6.3[data_base$A6==3] <- 3 A6.4[data_base$A6==4] <- 4 A6.5[data_base$A6==5] <- 5 A6.6[data_base$A6==6] <- 6 A6.7[data_base$A6==7] <- 7 A6.8[data_base$A6==8] <- 8 A6.9[data_base$A6==9] <- 9 A6.10[data_base$A6==10] <- 10 A6.11[data_base$A6==11] <- 11 A6.12[data_base$A6==12] <- 12 A6.13[data_base$A6==13] <- 13 A6.14[data_base$A6==14] <- 14 A6.15[data_base$A6==15] <- 15 A6.16[data_base$A6==16] <- 16 A6.17[data_base$A6==17] <- 17 A6.18[data_base$A6==18] <- 18 A6.19[data_base$A6==19] <- 19 A6.20[data_base$A6==20] <- 20 A6.21[data_base$A6==21] <- 21 A6.22[data_base$A6==22] <- 22 A6.23[data_base$A6==23] <- 23

for(i in 1:3) { assign(paste("A7",".",i,sep=""), rep(0, length(data_base$A7)))} A7.1[data_base$A7==1] <- 1 A7.2[data_base$A7==2] <- 2 A7.3[data_base$A7==3] <- 3

for(i in 1:4) { assign(paste("A10",".",i,sep=""), rep(0, length(data_base$A10)))} A10.1[data_base$A10==1] <- 1 A10.2[data_base$A10==2] <- 2

for(i in 1:3) { assign(paste("A11",".",i,sep=""), rep(0, length(data_base$A11)))} A11.1[data_base$A11==1] <- 1 A11.2[data_base$A11==2] <- 2 A11.3[data_base$A11==3] <- 3

for(i in 1:8) { assign(paste("A13",".",i,sep=""), rep(0, length(data_base$A13)))} A13.1[data_base$A13==1] <- 1 A13.2[data_base$A13==2] <- 2 A13.3[data_base$A13==3] <- 3 A13.4[data_base$A13==4] <- 4 A13.5[data_base$A13==5] <- 5 A13.6[data_base$A13==6] <- 6 A13.7[data_base$A13==7] <- 7 A13.8[data_base$A13==8] <- 8

for(i in 1:3) { assign(paste("A14",".",i,sep=""), rep(0, length(data_base$A14)))} A14.1[data_base$A14==1] <- 1 A14.2[data_base$A14==2] <- 2 A14.3[data_base$A14==3] <- 3

for(i in 1:2) { assign(paste("A15",".",i,sep=""), rep(0, length(data_base$A15)))} A15.1[data_base$A15==1] <- 1 A15.2[data_base$A15==2] <- 2

for(i in 1:2) { assign(paste("A16",".",i,sep=""), rep(0, length(data_base$A16)))} A16.1[data_base$A16==1] <- 1 A16.2[data_base$A16==2] <- 2

for(i in 1:2) { assign(paste("A17",".",i,sep=""), rep(0, length(data_base$A17)))} A17.1[data_base$A17==1] <- 1 A17.2[data_base$A17==2] <- 2

for(i in 1:2) { assign(paste("A20",".",i,sep=""), rep(0, length(data_base$A20)))} A20.1[data_base$A20==1] <- 1 A20.2[data_base$A20==2] <- 2

for(i in 1:2) { assign(paste("A21",".",i,sep=""), rep(0, length(data_base$A21)))} A21.1[data_base$A21==1] <- 1 A21.2[data_base$A21==2] <- 2

corel<-cor(data_base, method = "spearman") write.table(file="C:\Users\qwerty\Desktop\S\correlation.csv", corel)

mylogit <- glm(F~ A1.1+A1.2+A1.3+A1.4 +A2.1+A2.2+A2.3+A2.4+A2.5+A2.6+A2.7+A2.8+A2.9+A2.10+A2.11+A2.12+A2.13+A2.14 +A3.1+A3.2+A3.3+A3.4+A3.5+A3.6+A3.7+A3.8+A3.9+A3.10+A3.11+A3.12+A3.13+A3.14+A3.15+A3.16+A3.17+A3.18 +A6.1+A6.2+A6.3+A6.4+A6.5+A6.6+A6.7+A6.8+A6.9+A6.10+A6.11+A6.12+A6.13+A6.14+A6.15+A6.16+A6.17+A6.18+A6.19+A6.20+A6.21+A6.22+A6.23 +A7.1+A7.2+A7.3 +A8 +A10.1+A10.2 +A11.1+A11.2+A11.3 +A13.1+A13.2+A13.3+A13.4+A13.5+A13.6+A13.7+A13.8 +A14.1+A14.2+A14.3 +A15.1+A15.2 +A16.1+A16.2 +A17.1+A17.2 +A20.1+A20.2 +A21.1+A21.2, data = data_base, family = "binomial")

mylogit_tab_1 <- summary(mylogit) mylogit_tab_2 <- confint(mylogit) write.table(file="C:\Users\qwerty\Desktop\S\a\tab_model_1.csv", mylogit_tab_1) write.table(file="C:\Users\qwerty\Desktop\S\a\tab_model_2.csv", mylogit_tab_2)

result<-predict(mylogit,type="response") result_all<-round(result,digits=4) write.table(file="C:\Users\qwerty\Desktop\S\a\probability.csv", result_all)

mylogit_f <- step(mylogit)

mylogit_tab_1_f <- summary(mylogit_f) mylogit_tab_2_f <- confint(mylogit_f) write.table(file="C:\Users\qwerty\Desktop\S\a\tab_model_1_f.csv", mylogit_tab_1_f) write.table(file="C:\Users\qwerty\Desktop\S\a\tab_model_2_f.csv", mylogit_tab_2_f)

result_f<-predict(mylogit_f,type="response") result_all_f<-round(result_f,digits=4) write.table(file="C:\Users\qwerty\Desktop\S\a\probability_f.csv", result_all_f)

roc_model <- roc(data_base$F, predict(mylogit, type="response"), ci=TRUE) roc_model_f <- roc(data_base$F, predict(mylogit_f, type="response"), ci=TRUE)

plot_roc <- plot.roc(roc_model) write.image(file=""C:\Users\qwerty\Desktop\S\R\plot_roc.jpg", plot_roc)

plot_roc_f <- plot.roc(roc_model_f, add=TRUE, col="blue") write.image(file=""C:\Users\qwerty\Desktop\S\R\plot_roc_f.jpg", plot_roc_f)

roc.test(roc_model, roc_model_f, method="delong") roc.test(roc_model, roc_model_f, method="bootstrap")

Marshhhhh commented 8 years ago

setwd("C:/Users/qwerty/Desktop/Скоринг/1") Data <- read.csv2("German Data.csv", header = F) str(Data)

1 = Хороший, 2 = Плохой

Cor1 <- cor(Data, method = "spearman") Cor2 <- cor(Data, method = "kendall")

for(i in 1:ncol(Cor1)){ for (j in 1:nrow(Cor1)){ Cor1[i,j] <- ifelse(abs(as.numeric(Cor1[i,j]))>0.5,Cor1[i,j],"-") } }

for(i in 1:ncol(Cor2)){ for (j in 1:nrow(Cor2)){ Cor2[i,j] <- ifelse(abs(as.numeric(Cor2[i,j]))>0.5,Cor1[i,j],"-") } }

write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/Cor1_Sp.csv", Cor1) write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/Cor2_Ken.csv", Cor2)

cc <- c(1,3,4,5,7,9,10,11,12,13,14,15,17,19,20) for (i in cc){ Data[,i] <- as.factor(Data[,i]) }

Data1 <- Data[,-c(2,16,17,18)]

1 = Хороший, 2 = Плохой

Data1$V21 <- Data1$V21-1 mylogit <- glm(V21~., data = Data1, family = "binomial") mylogit_tab_1 <- summary(mylogit) mylogit_tab_2 <- confint(mylogit)

write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/tab_model_1.csv", mylogit_tab_1) write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/tab_model_2.csv", mylogit_tab_2)

result<-predict(mylogit,type="response") result_all<-round(result,digits=4) write.table(file="C:\Users\qwerty\Desktop\S\a\probability.csv", result_all)

mylogit_f <- step(mylogit)

mylogit_tab_1_f <- summary(mylogit_f) mylogit_tab_2_f <- confint(mylogit_f) write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/tab_model_1_f.csv", mylogit_tab_1_f) write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/tab_model_2_f.csv", mylogit_tab_2_f)

result_f<-predict(mylogit_f,type="response") result_all_f<-round(result_f,digits=4) write.table(file="C:\Users\qwerty\Desktop\S\a\probability_f.csv", result_all_f)

library(pROC) roc_model <- roc(Data$V21, predict(mylogit, type="response"), ci=TRUE) roc_model_f <- roc(data_base$F, predict(mylogit_f, type="response"), ci=TRUE)

plot_roc <- plot.roc(roc_model) summary(plot_roc) write.image(file=""C:\Users\qwerty\Desktop\S\R\plot_roc.jpg", plot_roc)

        plot_roc_f <- plot.roc(roc_model_f, add=TRUE, col="blue")
        write.image(file=""C:\Users\qwerty\Desktop\S\R\plot_roc_f.jpg", plot_roc_f)

roc.test(roc_model, roc_model_f, method="delong") roc.test(roc_model, roc_model_f, method="bootstrap")

#########

Marshhhhh commented 8 years ago

Чистый Герман

setwd("C:/Users/shirobokova_ma/Desktop/аспирин/конференции/ИЭиУ 2016/German") Data0 <- read.csv2("German Data.csv", header = T) Data <- read.csv2("Data.csv", header = T) str(Data)

1 = Хороший, 2 = Плохой

Data$D <- Data$D-1

корреляции, удаление взаимосвязных и незначимых переменных

Cor1_0 <- cor(Data0, method = "spearman") Cor2_0 <- cor(Data0, method = "kendall")

Cor1 <- cor(Data, method = "spearman") Cor2 <- cor(Data, method = "kendall")

for(i in 1:ncol(Cor1)){ for (j in 1:nrow(Cor1)){ Cor1[i,j] <- ifelse(abs(as.numeric(Cor1[i,j]))>0.3,Cor1[i,j],"-") } }

for(i in 1:ncol(Cor2)){ for (j in 1:nrow(Cor2)){ Cor2[i,j] <- ifelse(abs(as.numeric(Cor2[i,j]))>0.3,Cor2[i,j],"-") } }

write.table(file="C:/Users/shirobokova_ma/Desktop/аспирин/конференции/ИЭиУ 2016/German/Cor1_Sp.csv", Cor1, sep=";") write.table(file="C:/Users/shirobokova_ma/Desktop/аспирин/конференции/ИЭиУ 2016/German/Cor2_Ken.csv", Cor2, sep=";")

cc <- c(1,3,4,5,7,9,10,11,12,13,14,15,17,19,20) for (i in cc){ Data[,i] <- as.factor(Data[,i]) }

Data1 <- Data[,-c(2,8,16,17,18,20)]

1 = Хороший, 2 = Плохой

разбиение на тестовую и обучающую выборки

S <- sample(nrow(Data1),size=700,replace=F) Data1$Sample <- ifelse(row.names(Data1)%in%S,1,0) D1 <- subset(Data1,Data1$Sample==1)[,-ncol(Data1)] #train sample D2 <- subset(Data1,Data1$Sample==0)[,-ncol(Data1)] #test sample

построение модели

mylogit <- glm(D~., data = D1, family=binomial(link='logit')) mylogit_tab <- summary(mylogit)

ss <- coef(summary(mylogit)) ss_sig <- subset(ss,ss[,"Pr(>|z|)"]<0.1)

write.table(file="C:/Users/shirobokova_ma/Desktop/аспирин/конференции/ИЭиУ 2016/German/tab_model_2.csv", ss_sig, sep=";")

валидация: ROC-кривая + тестовая выборка

library(pROC) roc_model <- roc(D1$D, predict(mylogit, type="response"), ci=TRUE) plot_roc <- plot.roc(roc_model)

roc_model_test <- roc(D2$D, predict(mylogit, newdata=D2, type="response"), ci=TRUE)
plot_roc_test <- plot.roc(roc_model_test, add=TRUE, col="blue")

par(mfrow = c(1, 1)) par(font=6, font.lab=6, font.main=6,cex.lab=1,cex.axis=1) plot(plot_roc,ylim=c(0,1),xlab="Специфичность", ylab="Чувствительность", main=list("ROC-кривая", cex = 1,font = 1), type=1) abline(h = seq(0,1,0.05), v = seq(0,1,0.05), col = "lightgray", lty=3) abline(a=1, b=-1, col = "gray60") lines(plot_roc, type="l",lwd="2",col="darkblue") lines(plot_roc_test, type="l",lwd="2",col="red") leg <- c(paste0("AUC_train = ",round(plot_roc$auc,digits=2), "\n","Gini_train = ",(round(plot_roc$auc,digits=2)-0.5)_2,"\n"), paste0("AUC_test = ",round(plot_roc_test$auc,digits=2), "\n","Gini_test = ",(round(plot_roc_test$auc,digits=2)-0.5)_2))

legend(0.5,0.5,cex = 0.8,bty="n",lwd=2,lty=c(1,1),col=c("darkblue","red"), legend=leg)

#########

Marshhhhh commented 8 years ago

ROC curve(s) with ROCR

Chupakhin Vladimir (chupvl@gmail.com)

loading ROCR library

library("ROCR")

loading active compounds, or compounds with label1

active <- read.table("sample.active", sep=",", header=FALSE)

loading inactive compounds, or compounds with label2

inactive <- read.table("sample.inactive", sep=",", header=FALSE)

binding them and converting to matrix because ROCR works with matrix data

target_pred <- as.matrix(rbind(active,inactive))

because number of the colums should be the same - making additional param

ncol <- ncol(inactive)

generating classes (1 for active, 0 for inactive, but it can be 1 and -1 - there is no difference)

class.active <- matrix(sample(1, (ncol(active)_nrow(active)), replace=T), ncol=ncol) class.inactive <- matrix(sample(0, (ncol(inactive)_nrow(inactive)), replace=T), ncol=ncol)

binding the classes

target_class <- rbind(class.active,class.inactive)

target_class1 <- target_class[,1]

calculating the values for ROC curve

pred <- prediction(target_pred, target_class) perf <- performance(pred,"tpr","fpr")

changing params for the ROC plot - width, etc

par(mar=c(5,5,2,2),xaxs = "i",yaxs = "i",cex.axis=1.3,cex.lab=1.4)

plotting the ROC curve

plot(perf,col="black",lty=3, lwd=3)

calculating AUC

auc <- performance(pred,"auc")

now converting S4 class to vector

auc <- unlist(slot(auc, "y.values"))

adding min and max ROC AUC to the center of the plot

minauc<-min(round(auc, digits = 2)) maxauc<-max(round(auc, digits = 2)) minauct <- paste(c("min(AUC) = "),minauc,sep="") maxauct <- paste(c("max(AUC) = "),maxauc,sep="") legend(0.3,0.6,c(minauct,maxauct,"\n"),border="white",cex=1.7,box.col = "white") #

Marshhhhh commented 7 years ago

https://www.r-bloggers.com/how-to-perform-a-logistic-regression-in-r/ http://www.r-tutor.com/gpu-computing/gaussian-process/rvbm http://www.algorithmist.ru/2011/11/regularization-with-examples-in-r.html

Marshhhhh commented 7 years ago

sourse("reader.R")

reader=function(path=path,h=T) { split_path=strsplit(path,"/") file=split_path[[1]][length(split_path[[1]])] dir_path=paste(split_path[[1]][1:(length(split_path[[1]])-1)],collapse="/") setwd(dir_path) f1=read.table;f2=read.csv;f3=read.csv2;f4=read.delim;f5=read.delim2; for(f in c(f1,f2,f3,f4,f5)) { if(inherits(try(f(file,h=h,nrow=1),silent=T),"try-error")==T) { next } else { file_1_row=f(file,h=h,nrow=1,dec=".") split_file=strsplit(paste(names(file_1_row),collapse=""),"")[[1]] split_file_letters=split_file[-grep("\.|0|1|2|3|4|5|6|7|8|9|\,|;|:|\)|\)",split_file)] letters_in_alphabet=names(table(tolower(split_file_letters))) %in% strsplit("йцукенгшщзхъэждлорпавыфячсмитьбюqwertyuioplkjhgfdsazxcvbnm","")[[1]] if(dim(file_1_row)[2]>1) { file_1_row=f(file,h=T,nrow=1,dec=".") num_row_dot=NULL num_row_comma=NULL for(i in 1:length(file_1_row)){ if(is.numeric(file_1_row[,i])==T){ num_row_dot=append(num_row_dot,i)}} file_1_row2=f(file,h=T,nrow=1,dec=",") for(i in 1:length(file_1_row2)){ if(is.numeric(file_1_row2[,i])==T){ num_row_comma=append(num_row_comma,i)}} decimal=ifelse(length(num_row_dot)>length(num_row_comma),".",",")

if(length(letters_in_alphabet[letters_in_alphabet==T])/length(letters_in_alphabet)>0.9) { return(f(file,h=h,dec=decimal)) } else { return(f(file,h=h,encoding="UTF-8",dec=decimal)) } } } } }