Open mlandry22 opened 5 years ago
library(data.table)
# Work from the models directory so the relative "../input" paths resolve.
setwd("/home/mark/competitions/ieee-fraud-detection/models/")

# Left-join the identity table onto the transaction table by TransactionID;
# all.x = TRUE keeps transactions that have no identity row.
train <- merge(
  fread("../input/train_transaction.csv"),
  fread("../input/train_identity.csv"),
  by = "TransactionID",
  all.x = TRUE
)
# `test` has to be loaded as well: it is tabulated and given a fold_id below,
# and its column names drive the feature loop later in the script. With this
# load commented out the script stops with "object 'test' not found".
test <- merge(
  fread("../input/test_transaction.csv"),
  fread("../input/test_identity.csv"),
  by = "TransactionID",
  all.x = TRUE
)

# Bucket width: one tenth of the TransactionDT range observed in train.
GAP <- train[, max(TransactionDT) - min(TransactionDT)] / 10

# Row counts per time bucket (printed for inspection). The bucket index is
# floor(TransactionDT / GAP) on the raw TransactionDT value, then shifted by 1.
# Train: the index is capped at 9, so the resulting ids never exceed 10.
train[, .N, by = pmin(9, floor(TransactionDT / GAP)) + 1]
# Test: the index is raised to at least 12, so the resulting ids are >= 13.
# NOTE(review): presumably this keeps test ids disjoint from train's - confirm.
test[, .N, by = pmax(12, floor(TransactionDT / GAP)) + 1]

# Store the same bucket ids as the fold_id column (added by reference).
train[, fold_id := pmin(9, floor(TransactionDT / GAP)) + 1]
test[, fold_id := pmax(12, floor(TransactionDT / GAP)) + 1]

# Sanity check: rows per train fold.
train[, .N, by = fold_id]
library(Metrics)

# One small table of per-split AUCs is appended here for every column tried.
te_list <- list()

for (feature in colnames(test)) {
  # feature <- "V257"  # uncomment to step through a single column by hand

  # Target encoding: for each (card1, feature value) group, the mean of isFraud
  # over the group's rows in folds 1-7, written to every row of that group.
  # NOTE(review): a group with no rows in folds < 8 gets the mean of an empty
  # vector; confirm how auc() scores those rows in the later folds.
  train[, te1 := mean(isFraud[fold_id < 8]), by = .(card1, get(feature))]

  # AUC of the encoding against isFraud on the encoding folds (train), on all
  # held-out folds together (validation), and on each held-out fold separately.
  auc_mean <- c(
    train      = train[fold_id < 8, auc(isFraud, te1)],
    validation = train[fold_id >= 8, auc(isFraud, te1)],
    fold8      = train[fold_id == 8, auc(isFraud, te1)],
    fold9      = train[fold_id == 9, auc(isFraud, te1)],
    fold10     = train[fold_id == 10, auc(isFraud, te1)]
  )

  # Disabled alternative: score the raw feature rank instead of the encoding.
  # train[, num1 := frank(get(feature))]
  # auc_numeric <- c(
  #   train      = train[fold_id < 8, auc(isFraud, num1)],
  #   validation = train[fold_id >= 8, auc(isFraud, num1)],
  #   fold8      = train[fold_id == 8, auc(isFraud, num1)],
  #   fold9      = train[fold_id == 9, auc(isFraud, num1)],
  #   fold10     = train[fold_id == 10, auc(isFraud, num1)]
  # )
  # auc_numeric <- pmax(auc_numeric, 1 - auc_numeric)
  # te_list[[length(te_list) + 1]] <- data.table(
  #   feature = feature, split = names(auc_mean),
  #   auc_mean = auc_mean, auc_numeric = auc_numeric
  # )

  # Long format: one row per split for this feature.
  te_list[[length(te_list) + 1]] <- data.table(
    feature = feature,
    split = names(auc_mean),
    auc_mean = auc_mean
  )

  # Progress report after every tenth feature: the current top validation AUCs.
  if (length(te_list) %% 10 == 0) {
    print("***********")
    print("***********")
    print(length(te_list))
    print(
      rbindlist(te_list)[split == "validation"][
        order(-auc_mean)
      ][seq_len(min(10, length(te_list)))]
    )
    # print(rbindlist(te_list)[split == "validation"][
    #   order(-auc_numeric)
    # ][seq_len(min(10, length(te_list)))])
  }
}
# Stack the per-feature results into one long table: feature, split, auc_mean.
feature_analysis <- rbindlist(te_list)

# The ten features with the highest validation AUC ...
top_validation_features <-
  feature_analysis[split == "validation"][order(-auc_mean)][1:10, feature]

# ... shown wide (one column per split), best validation AUC first.
dcast(
  feature_analysis[feature %in% top_validation_features],
  feature ~ split,
  value.var = "auc_mean"
)[
  , .(feature, train, validation, fold8, fold9, fold10)
][order(-validation)]

# Row count and fraud rate per fold, for context when reading the AUCs.
train[, .(.N, fraud = mean(isFraud)), by = fold_id][order(fold_id)]

# Persist the long-format results.
fwrite(feature_analysis, "feature_analysis_initial_single_card1.csv")

# Disabled: the same top-10 view for the earlier single-feature run.
# f1_analysis <- fread("feature_analysis_initial_single.csv")
# dcast(
#   f1_analysis[feature %in% f1_analysis[split == "validation"][
#     order(-auc_mean)
#   ][1:10, feature]],
#   feature ~ split,
#   value.var = "auc_mean"
# )[, .(feature, train, validation, fold8, fold9, fold10)][order(-validation)]
> dcast(feature_analysis[paste(top_feature,feature) %in% feature_analysis[split=="validation"][order(-auc_mean)][1:10,paste(top_feature,feature)]]
+ ,top_feature+feature~split,value.var = "auc_mean"
+ )[,.(top_feature,feature,train,validation,fold8,fold9,fold10)][order(-validation)]
top_feature feature train validation fold8 fold9 fold10
1: card2 V70 0.8080394 0.7724967 0.7885659 0.7584897 0.7693863
2: card2 V69 0.8074520 0.7723277 0.7872140 0.7588878 0.7698420
3: card2 V91 0.8033130 0.7684628 0.7780335 0.7569342 0.7687264
4: card2 V29 0.8039496 0.7681870 0.7782208 0.7560165 0.7690464
5: card2 V90 0.8029745 0.7680992 0.7766026 0.7561603 0.7696218
6: card2 V30 0.8039194 0.7662201 0.7773111 0.7540120 0.7661814
7: card1 V201 0.8893372 0.7648908 0.7780972 0.7497434 0.7656982
8: card2 V201 0.7960752 0.7645828 0.7703538 0.7514570 0.7699774
9: card1 V200 0.8880026 0.7639080 0.7799075 0.7470691 0.7636134
10: card2 V200 0.7938211 0.7624812 0.7699770 0.7492025 0.7664808
> dcast(feature_analysis[paste(top_feature,feature) %in% feature_analysis[
+ split=="validation" & !(top_feature %in% c("card1","card2")) & !(feature %in% c("card1","card2"))][order(-auc_mean)][1:10,paste(top_feature,feature)]]
+ ,top_feature+feature~split,value.var = "auc_mean"
+ )[,.(top_feature,feature,train,validation,fold8,fold9,fold10)][order(-validation)]
top_feature feature train validation fold8 fold9 fold10
1: V74 D3 0.7485099 0.7543073 0.7373149 0.7476448 0.7768922
2: V94 D3 0.7571363 0.7539589 0.7528323 0.7477382 0.7596045
3: V94 V280 0.7537591 0.7538383 0.7542286 0.7557403 0.7487204
4: V94 V294 0.7503306 0.7533290 0.7557583 0.7552299 0.7468087
5: V33 D3 0.7522336 0.7530796 0.7482475 0.7486159 0.7619907
6: V34 D3 0.7526732 0.7526002 0.7508623 0.7463888 0.7601227
7: V33 V280 0.7483028 0.7522751 0.7508301 0.7577256 0.7479001
8: V74 M4 0.7386231 0.7514857 0.7301852 0.7430629 0.7772550
9: V34 V280 0.7485255 0.7506598 0.7515049 0.7551707 0.7450303
10: V33 V294 0.7446141 0.7502535 0.7483305 0.7575803 0.7450177
Single features:
Interactions of card1: