setwd("C:\Users\qwerty\Desktop") tab <- read.table("data_R.txt", sep=";",dec=",", head=TRUE) tab$A1 <- factor(tab$A1)

cor(tab, method = "spearman") cor(tab, method = "kendall")

mylogit <- glm(F~A1+A2+A3+A4+A5+A6+A7+A8+A9+A10+A11+A12+A13+A14+A15+A16+A17+A18+A19+A20+A21+A22, data = tab, family = "binomial") summary(mylogit) confint(mylogit)

install.packages("pROC") library(pROC)

setwd("C:\Users\qwerty\Desktop\S") data_base <- read.table("data.txt", sep=";",dec=",", head=TRUE)

for(i in 1:5) { assign(paste("A1",".",i,sep=""), rep(0, length(data_base$A1)))} A1.1[data_base$A1==1] <- 1 A1.2[data_base$A1==2] <- 2 A1.3[data_base$A1==3] <- 3 A1.4[data_base$A1==4] <- 4

for(i in 1:14) { assign(paste("A2",".",i,sep=""), rep(0, length(data_base$A2)))} A2.1[data_base$A2==1] <- 1 A2.2[data_base$A2==2] <- 2 A2.3[data_base$A2==3] <- 3 A2.4[data_base$A2==4] <- 4 A2.5[data_base$A2==5] <- 5 A2.6[data_base$A2==6] <- 6 A2.7[data_base$A2==7] <- 7 A2.8[data_base$A2==8] <- 8 A2.9[data_base$A2==9] <- 9 A2.10[data_base$A2==10] <- 10 A2.11[data_base$A2==11] <- 11 A2.12[data_base$A2==12] <- 12 A2.13[data_base$A2==13] <- 13 A2.14[data_base$A2==14] <- 14

for(i in 1:18) { assign(paste("A3",".",i,sep=""), rep(0, length(data_base$A3)))} A3.1[data_base$A3==1] <- 1 A3.2[data_base$A3==2] <- 2 A3.3[data_base$A3==3] <- 3 A3.4[data_base$A3==4] <- 4 A3.5[data_base$A3==5] <- 5 A3.6[data_base$A3==6] <- 6 A3.7[data_base$A3==7] <- 7 A3.8[data_base$A3==8] <- 8 A3.9[data_base$A3==9] <- 9 A3.10[data_base$A3==10] <- 10 A3.11[data_base$A3==11] <- 11 A3.12[data_base$A3==12] <- 12 A3.13[data_base$A3==13] <- 13 A3.14[data_base$A3==14] <- 14 A3.15[data_base$A3==15] <- 15 A3.16[data_base$A3==16] <- 16 A3.17[data_base$A3==17] <- 17 A3.18[data_base$A3==18] <- 18

for(i in 1:23) { assign(paste("A6",".",i,sep=""), rep(0, length(data_base$A6)))} A6.1[data_base$A6==1] <- 1 A6.2[data_base$6==2] <- 2 A6.3[data_base$A6==3] <- 3 A6.4[data_base$A6==4] <- 4 A6.5[data_base$A6==5] <- 5 A6.6[data_base$A6==6] <- 6 A6.7[data_base$A6==7] <- 7 A6.8[data_base$A6==8] <- 8 A6.9[data_base$A6==9] <- 9 A6.10[data_base$A6==10] <- 10 A6.11[data_base$A6==11] <- 11 A6.12[data_base$A6==12] <- 12 A6.13[data_base$A6==13] <- 13 A6.14[data_base$A6==14] <- 14 A6.15[data_base$A6==15] <- 15 A6.16[data_base$A6==16] <- 16 A6.17[data_base$A6==17] <- 17 A6.18[data_base$A6==18] <- 18 A6.19[data_base$A6==19] <- 19 A6.20[data_base$A6==20] <- 20 A6.21[data_base$A6==21] <- 21 A6.22[data_base$A6==22] <- 22 A6.23[data_base$A6==23] <- 23

for(i in 1:3) { assign(paste("A7",".",i,sep=""), rep(0, length(data_base$A7)))} A7.1[data_base$A7==1] <- 1 A7.2[data_base$A7==2] <- 2 A7.3[data_base$A7==3] <- 3

for(i in 1:4) { assign(paste("A10",".",i,sep=""), rep(0, length(data_base$A10)))} A10.1[data_base$A10==1] <- 1 A10.2[data_base$A10==2] <- 2

for(i in 1:3) { assign(paste("A11",".",i,sep=""), rep(0, length(data_base$A11)))} A11.1[data_base$A11==1] <- 1 A11.2[data_base$A11==2] <- 2 A11.3[data_base$A11==3] <- 3

for(i in 1:8) { assign(paste("A13",".",i,sep=""), rep(0, length(data_base$A13)))} A13.1[data_base$A13==1] <- 1 A13.2[data_base$A13==2] <- 2 A13.3[data_base$A13==3] <- 3 A13.4[data_base$A13==4] <- 4 A13.5[data_base$A13==5] <- 5 A13.6[data_base$A13==6] <- 6 A13.7[data_base$A13==7] <- 7 A13.8[data_base$A13==8] <- 8

for(i in 1:3) { assign(paste("A14",".",i,sep=""), rep(0, length(data_base$A14)))} A14.1[data_base$A14==1] <- 1 A14.2[data_base$A14==2] <- 2 A14.3[data_base$A14==3] <- 3

for(i in 1:2) { assign(paste("A15",".",i,sep=""), rep(0, length(data_base$A15)))} A15.1[data_base$A15==1] <- 1 A15.2[data_base$A15==2] <- 2

for(i in 1:2) { assign(paste("A16",".",i,sep=""), rep(0, length(data_base$A16)))} A16.1[data_base$A16==1] <- 1 A16.2[data_base$A16==2] <- 2

for(i in 1:2) { assign(paste("A17",".",i,sep=""), rep(0, length(data_base$A17)))} A17.1[data_base$A17==1] <- 1 A17.2[data_base$A17==2] <- 2

for(i in 1:2) { assign(paste("A20",".",i,sep=""), rep(0, length(data_base$A20)))} A20.1[data_base$A20==1] <- 1 A20.2[data_base$A20==2] <- 2

for(i in 1:2) { assign(paste("A21",".",i,sep=""), rep(0, length(data_base$A21)))} A21.1[data_base$A21==1] <- 1 A21.2[data_base$A21==2] <- 2

corel<-cor(data_base, method = "spearman") write.table(file="C:\Users\qwerty\Desktop\S\correlation.csv", corel)

mylogit <- glm(F~ A1.1+A1.2+A1.3+A1.4 +A2.1+A2.2+A2.3+A2.4+A2.5+A2.6+A2.7+A2.8+A2.9+A2.10+A2.11+A2.12+A2.13+A2.14 +A3.1+A3.2+A3.3+A3.4+A3.5+A3.6+A3.7+A3.8+A3.9+A3.10+A3.11+A3.12+A3.13+A3.14+A3.15+A3.16+A3.17+A3.18 +A6.1+A6.2+A6.3+A6.4+A6.5+A6.6+A6.7+A6.8+A6.9+A6.10+A6.11+A6.12+A6.13+A6.14+A6.15+A6.16+A6.17+A6.18+A6.19+A6.20+A6.21+A6.22+A6.23 +A7.1+A7.2+A7.3 +A8 +A10.1+A10.2 +A11.1+A11.2+A11.3 +A13.1+A13.2+A13.3+A13.4+A13.5+A13.6+A13.7+A13.8 +A14.1+A14.2+A14.3 +A15.1+A15.2 +A16.1+A16.2 +A17.1+A17.2 +A20.1+A20.2 +A21.1+A21.2, data = data_base, family = "binomial")

mylogit_tab_1 <- summary(mylogit) mylogit_tab_2 <- confint(mylogit) write.table(file="C:\Users\qwerty\Desktop\S\a\tab_model_1.csv", mylogit_tab_1) write.table(file="C:\Users\qwerty\Desktop\S\a\tab_model_2.csv", mylogit_tab_2)

result<-predict(mylogit,type="response") result_all<-round(result,digits=4) write.table(file="C:\Users\qwerty\Desktop\S\a\probability.csv", result_all)

mylogit_f <- step(mylogit)

mylogit_tab_1_f <- summary(mylogit_f) mylogit_tab_2_f <- confint(mylogit_f) write.table(file="C:\Users\qwerty\Desktop\S\a\tab_model_1_f.csv", mylogit_tab_1_f) write.table(file="C:\Users\qwerty\Desktop\S\a\tab_model_2_f.csv", mylogit_tab_2_f)

result_f<-predict(mylogit_f,type="response") result_all_f<-round(result_f,digits=4) write.table(file="C:\Users\qwerty\Desktop\S\a\probability_f.csv", result_all_f)

roc_model <- roc(data_base$F, predict(mylogit, type="response"), ci=TRUE) roc_model_f <- roc(data_base$F, predict(mylogit_f, type="response"), ci=TRUE)

plot_roc <- plot.roc(roc_model) write.image(file=""C:\Users\qwerty\Desktop\S\R\plot_roc.jpg", plot_roc)

plot_roc_f <- plot.roc(roc_model_f, add=TRUE, col="blue") write.image(file=""C:\Users\qwerty\Desktop\S\R\plot_roc_f.jpg", plot_roc_f)

roc.test(roc_model, roc_model_f, method="delong") roc.test(roc_model, roc_model_f, method="bootstrap")

setwd("C:/Users/qwerty/Desktop/Скоринг/1") Data <- read.csv2("German Data.csv", header = F) str(Data)

1 = Хороший, 2 = Плохой

Cor1 <- cor(Data, method = "spearman") Cor2 <- cor(Data, method = "kendall")

for(i in 1:ncol(Cor1)){ for (j in 1:nrow(Cor1)){ Cor1[i,j] <- ifelse(abs(as.numeric(Cor1[i,j]))>0.5,Cor1[i,j],"-") } }

for(i in 1:ncol(Cor2)){ for (j in 1:nrow(Cor2)){ Cor2[i,j] <- ifelse(abs(as.numeric(Cor2[i,j]))>0.5,Cor1[i,j],"-") } }

write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/Cor1_Sp.csv", Cor1) write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/Cor2_Ken.csv", Cor2)

cc <- c(1,3,4,5,7,9,10,11,12,13,14,15,17,19,20) for (i in cc){ Data[,i] <- as.factor(Data[,i]) }

Data1 <- Data[,-c(2,16,17,18)]

1 = Хороший, 2 = Плохой

Data1$V21 <- Data1$V21-1 mylogit <- glm(V21~., data = Data1, family = "binomial") mylogit_tab_1 <- summary(mylogit) mylogit_tab_2 <- confint(mylogit)

write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/tab_model_1.csv", mylogit_tab_1) write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/tab_model_2.csv", mylogit_tab_2)

result<-predict(mylogit,type="response") result_all<-round(result,digits=4) write.table(file="C:\Users\qwerty\Desktop\S\a\probability.csv", result_all)

mylogit_f <- step(mylogit)

mylogit_tab_1_f <- summary(mylogit_f) mylogit_tab_2_f <- confint(mylogit_f) write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/tab_model_1_f.csv", mylogit_tab_1_f) write.table(file="C:/Users/qwerty/Desktop/Скоринг/1/tab_model_2_f.csv", mylogit_tab_2_f)

result_f<-predict(mylogit_f,type="response") result_all_f<-round(result_f,digits=4) write.table(file="C:\Users\qwerty\Desktop\S\a\probability_f.csv", result_all_f)

library(pROC) roc_model <- roc(Data$V21, predict(mylogit, type="response"), ci=TRUE) roc_model_f <- roc(data_base$F, predict(mylogit_f, type="response"), ci=TRUE)

plot_roc <- plot.roc(roc_model) summary(plot_roc) write.image(file=""C:\Users\qwerty\Desktop\S\R\plot_roc.jpg", plot_roc)

        plot_roc_f <- plot.roc(roc_model_f, add=TRUE, col="blue")
        write.image(file=""C:\Users\qwerty\Desktop\S\R\plot_roc_f.jpg", plot_roc_f)

roc.test(roc_model, roc_model_f, method="delong") roc.test(roc_model, roc_model_f, method="bootstrap")

#########

Чистый Герман

setwd("C:/Users/shirobokova_ma/Desktop/аспирин/конференции/ИЭиУ 2016/German") Data0 <- read.csv2("German Data.csv", header = T) Data <- read.csv2("Data.csv", header = T) str(Data)

1 = Хороший, 2 = Плохой

Data$D <- Data$D-1

корреляции, удаление взаимосвязных и незначимых переменных

Cor1_0 <- cor(Data0, method = "spearman") Cor2_0 <- cor(Data0, method = "kendall")

Cor1 <- cor(Data, method = "spearman") Cor2 <- cor(Data, method = "kendall")

for(i in 1:ncol(Cor1)){ for (j in 1:nrow(Cor1)){ Cor1[i,j] <- ifelse(abs(as.numeric(Cor1[i,j]))>0.3,Cor1[i,j],"-") } }

for(i in 1:ncol(Cor2)){ for (j in 1:nrow(Cor2)){ Cor2[i,j] <- ifelse(abs(as.numeric(Cor2[i,j]))>0.3,Cor2[i,j],"-") } }

write.table(file="C:/Users/shirobokova_ma/Desktop/аспирин/конференции/ИЭиУ 2016/German/Cor1_Sp.csv", Cor1, sep=";") write.table(file="C:/Users/shirobokova_ma/Desktop/аспирин/конференции/ИЭиУ 2016/German/Cor2_Ken.csv", Cor2, sep=";")

cc <- c(1,3,4,5,7,9,10,11,12,13,14,15,17,19,20) for (i in cc){ Data[,i] <- as.factor(Data[,i]) }

Data1 <- Data[,-c(2,8,16,17,18,20)]

1 = Хороший, 2 = Плохой

разбиение на тестовую и обучающую выборки

S <- sample(nrow(Data1),size=700,replace=F) Data1$Sample <- ifelse(row.names(Data1)%in%S,1,0) D1 <- subset(Data1,Data1$Sample==1)[,-ncol(Data1)] #train sample D2 <- subset(Data1,Data1$Sample==0)[,-ncol(Data1)] #test sample

построение модели

mylogit <- glm(D~., data = D1, family=binomial(link='logit')) mylogit_tab <- summary(mylogit)

ss <- coef(summary(mylogit)) ss_sig <- subset(ss,ss[,"Pr(>|z|)"]<0.1)

write.table(file="C:/Users/shirobokova_ma/Desktop/аспирин/конференции/ИЭиУ 2016/German/tab_model_2.csv", ss_sig, sep=";")

валидация: ROC-кривая + тестовая выборка

library(pROC) roc_model <- roc(D1$D, predict(mylogit, type="response"), ci=TRUE) plot_roc <- plot.roc(roc_model)

roc_model_test <- roc(D2$D, predict(mylogit, newdata=D2, type="response"), ci=TRUE)
plot_roc_test <- plot.roc(roc_model_test, add=TRUE, col="blue")

par(mfrow = c(1, 1)) par(font=6, font.lab=6, font.main=6,cex.lab=1,cex.axis=1) plot(plot_roc,ylim=c(0,1),xlab="Специфичность", ylab="Чувствительность", main=list("ROC-кривая", cex = 1,font = 1), type=1) abline(h = seq(0,1,0.05), v = seq(0,1,0.05), col = "lightgray", lty=3) abline(a=1, b=-1, col = "gray60") lines(plot_roc, type="l",lwd="2",col="darkblue") lines(plot_roc_test, type="l",lwd="2",col="red") leg <- c(paste0("AUC_train = ",round(plot_roc$auc,digits=2), "\n","Gini_train = ",(round(plot_roc$auc,digits=2)-0.5)_2,"\n"), paste0("AUC_test = ",round(plot_roc_test$auc,digits=2), "\n","Gini_test = ",(round(plot_roc_test$auc,digits=2)-0.5)_2))

legend(0.5,0.5,cex = 0.8,bty="n",lwd=2,lty=c(1,1),col=c("darkblue","red"), legend=leg)

#########

ROC curve(s) with ROCR

Chupakhin Vladimir (chupvl@gmail.com)

loading ROCR library

library("ROCR")

loading active compounds, or compounds with label1

active <- read.table("sample.active", sep=",", header=FALSE)

loading inactive compounds, or compounds with label2

inactive <- read.table("sample.inactive", sep=",", header=FALSE)

binding them and converting to matrix because ROCR works with matrix data

target_pred <- as.matrix(rbind(active,inactive))

because number of the colums should be the same - making additional param

ncol <- ncol(inactive)

generating classes (1 for active, 0 for inactive, but it can be 1 and -1 - there is no difference)

class.active <- matrix(sample(1, (ncol(active)_nrow(active)), replace=T), ncol=ncol) class.inactive <- matrix(sample(0, (ncol(inactive)_nrow(inactive)), replace=T), ncol=ncol)

binding the classes

target_class <- rbind(class.active,class.inactive)

target_class1 <- target_class[,1]

calculating the values for ROC curve

pred <- prediction(target_pred, target_class) perf <- performance(pred,"tpr","fpr")

changing params for the ROC plot - width, etc

par(mar=c(5,5,2,2),xaxs = "i",yaxs = "i",cex.axis=1.3,cex.lab=1.4)

plotting the ROC curve

plot(perf,col="black",lty=3, lwd=3)

calculating AUC

auc <- performance(pred,"auc")

now converting S4 class to vector

auc <- unlist(slot(auc, "y.values"))

adding min and max ROC AUC to the center of the plot

minauc<-min(round(auc, digits = 2)) maxauc<-max(round(auc, digits = 2)) minauct <- paste(c("min(AUC) = "),minauc,sep="") maxauct <- paste(c("max(AUC) = "),maxauc,sep="") legend(0.3,0.6,c(minauct,maxauct,"\n"),border="white",cex=1.7,box.col = "white") #

sourse("reader.R")

reader=function(path=path,h=T) { split_path=strsplit(path,"/") file=split_path[[1]][length(split_path[[1]])] dir_path=paste(split_path[[1]][1:(length(split_path[[1]])-1)],collapse="/") setwd(dir_path) f1=read.table;f2=read.csv;f3=read.csv2;f4=read.delim;f5=read.delim2; for(f in c(f1,f2,f3,f4,f5)) { if(inherits(try(f(file,h=h,nrow=1),silent=T),"try-error")==T) { next } else { file_1_row=f(file,h=h,nrow=1,dec=".") split_file=strsplit(paste(names(file_1_row),collapse=""),"")[[1]] split_file_letters=split_file[-grep("\.|0|1|2|3|4|5|6|7|8|9|\,|;|:|\)|\)",split_file)] letters_in_alphabet=names(table(tolower(split_file_letters))) %in% strsplit("йцукенгшщзхъэждлорпавыфячсмитьбюqwertyuioplkjhgfdsazxcvbnm","")[[1]] if(dim(file_1_row)[2]>1) { file_1_row=f(file,h=T,nrow=1,dec=".") num_row_dot=NULL num_row_comma=NULL for(i in 1:length(file_1_row)){ if(is.numeric(file_1_row[,i])==T){ num_row_dot=append(num_row_dot,i)}} file_1_row2=f(file,h=T,nrow=1,dec=",") for(i in 1:length(file_1_row2)){ if(is.numeric(file_1_row2[,i])==T){ num_row_comma=append(num_row_comma,i)}} decimal=ifelse(length(num_row_dot)>length(num_row_comma),".",",")

if(length(letters_in_alphabet[letters_in_alphabet==T])/length(letters_in_alphabet)>0.9) { return(f(file,h=h,dec=decimal)) } else { return(f(file,h=h,encoding="UTF-8",dec=decimal)) } } } } }

Marshhhhh / first

Scoring #3

1 = Хороший, 2 = Плохой

1 = Хороший, 2 = Плохой

Чистый Герман

1 = Хороший, 2 = Плохой

корреляции, удаление взаимосвязных и незначимых переменных

1 = Хороший, 2 = Плохой

разбиение на тестовую и обучающую выборки

построение модели

валидация: ROC-кривая + тестовая выборка

ROC curve(s) with ROCR

Chupakhin Vladimir (chupvl@gmail.com)

loading ROCR library

loading active compounds, or compounds with label1

loading inactive compounds, or compounds with label2

binding them and converting to matrix because ROCR works with matrix data

because number of the colums should be the same - making additional param

generating classes (1 for active, 0 for inactive, but it can be 1 and -1 - there is no difference)

binding the classes

target_class1 <- target_class[,1]

calculating the values for ROC curve

changing params for the ROC plot - width, etc

plotting the ROC curve

calculating AUC

now converting S4 class to vector

adding min and max ROC AUC to the center of the plot