SixiangHu / DataMan

R package for data cleaning, preliminary data analysis and modeling assessing with visualisation.
3 stars 0 forks source link

view the feature without request of the model #16

Closed SixiangHu closed 9 years ago

SixiangHu commented 9 years ago
dataPlot <- function(data,xvar,yvar,weights=NULL,byvar=NULL,newGroupNum=NULL,interactive=FALSE,...){

  opts.list<-list(...)
  opts <- names(list(...))
  if("xlim" %in% opts) xlim<-opts.list$xlim
  if("ylim" %in% opts) ylim<-opts.list$ylim
  if("binwidth" %in% opts) binwidth<-opts.list$binwidth else binwidth <- 1

  # Error Trapping
  if( is.null(data) ) stop("data set provided is null.")
  if( is.null(xvar) ) stop("X variable provided is null.") 
  if( is.null(yvar) ) stop("Responce variable provided is null.")

  if (is.character(xvar)) {
    if(!xvar %in% colnames(data)) stop(paste("xvar variable (",xvar,") cannot be found.",""))
    x <- data[,which(names(data)==xvar)]
    xname <- xvar
  }
  else if (is.integer(xvar)) { 
    x <- data[,xvar]
    xname <- names(data)[xvar]
  }
  else stop ("xvar provided is either a character (variable name) or integer (position of the variable).")

  if (is.character(yvar)) {
    if(!yvar %in% colnames(data)) stop(paste("yvar variable (",yvar,") cannot be found.",""))
    yname <- yvar
    y <- data[,which(names(data)==yvar)]
  }
  else if (is.integer(yvar)) {
    y <- data[,yvar]
    yname <- names(data)[yvar]
  }
  else stop ("yvar provided is either a character (variable name) or integer (position of the variable).")

  if( !is.null(byvar) ){
    if (is.character(byvar)){
      if(!byvar %in% colnames(data) ) stop(paste("xvar variable (",byvar,") cannot be found.",""))
      by <- data[,which(names(data)==byvar)]
      byname <- byvar
    }
    else if (is.integer(byvar)) {
      xname <- names(data)[byvar]
      by <- as.character(data[,byvar])
    }
    else by <- NULL
  }
  else by <- NULL

  if( !is.null(weights) ){
    wname = "w"
    if (is.character(weights)){
      if(!weights %in% colnames(data) ) stop(paste("xvar variable (",weights,") cannot be found.",""))
      w <- data[,which(names(data)==weights)]
    }
    else if (is.integer(weights) && length(weights)==1) {
      w <- data[,weights]
    }
    else if (is.integer(weights) && length(weights)>1){
      if ( dim(data)[1] != length(weights) ) stop ("Length of weights is not the same as dimension of the data provided.")
      w <- weights
    }
  }
  else w <- rep(1,dim(data)[1])

  #New Group for xvar which has too much levels.
  if ( (is.numeric(x) || is.integer(x) ) && nlevels(as.factor(x))>100 ) {
    if ( is.null(newGroupNum) ) newGroupNum <- 10

    new_band <- seq(min(x),max(x),length.out=newGroupNum)
    x <- cut(x,new_band,include.lowest = TRUE)
  }

  #New Group for byvar which has too much levels.
  if(!is.null(by)){
    if ( (is.numeric(by) || is.integer(by)) && nlevels(as.factor(by))>100 ) {
      if ( is.null(newGroupNum) ) newGroupNum <- 10

      new_band <- seq(min(by),max(by),length.out=newGroupNum)
      by <- cut(by,new_band,include.lowest = TRUE)
    }
  }

  #Data for plot
  if (is.null(by)) {
    data.plot <- data.table::as.data.table(as.data.frame(cbind(x=x,y=y,w=w),stringsAsFactors=FALSE))
    data.table::setkey(data.plot,x)

    data.plot <- data.plot[,lapply(.SD,as.numeric),by=x,.SDcols=c("y","w")]
    data.agg <- as.data.frame(data.plot[,lapply(.SD,weighted.mean,w=w),by=x,.SDcols=c("y","w")],row.names=c("xvar","weights","observed"))
    data.freq <- as.data.frame(data.plot[,sum(w),by=x][,freq:=V1/sum(V1)])

    data.melt <- reshape2::melt(data.agg[,-3],id=c("x"))

    #line graph

    strV1 <- paste("Observation Analysis on: ",xname)

    gLine <- ggplot2::ggplot(data=data.melt,aes(x=x,y=value)) + 
      ggplot2::geom_line(size=1,colour= "magenta3") + ggplot2::geom_point(size=4,fill="white",shape=22)
    if(("xlim" %in% opts) && is.numeric(data.melt$x)) gLine <-gLine + ggplot2::scale_x_continuous(limits=xlim)
    else if(("xlim" %in% opts) && !is.numeric(data.melt$x)) gLine <-gLine + ggplot2::scale_x_discrete(limits=xlim)
    else if(("xlim" %in% opts) && is(data.melt[,"x"],"Date")) gLine <-gLine + ggplot2::scale_x_date(label=date_format("%y%m"),limits=xlim)
    if("ylim" %in% opts) gLine <-gLine + ggplot2::ylim(ylim)
    gLine <- gLine + ggplot2::xlab("") + ggplot2::ylab(yname)+ ggplot2::ggtitle(strV1)+ theme_mp_line
    if(nlevels(as.factor(data.melt$x))>25) gLine <- gLine + ggplot2::theme(axis.text.x = element_text(angle = 90,hjust=0.5,vjust=0.5))

    #histogram graph
    ghist <- ggplot2::ggplot(data=data.freq,aes(x=x,y=freq))+
      ggplot2::geom_histogram(stat="identity",colour="black",fill="yellow")
    if(("xlim" %in% opts) && is.numeric(data.melt$x)) ghist <-ghist + ggplot2::scale_x_continuous(limits=xlim)
    else if(("xlim" %in% opts) && !is.numeric(data.melt$x)) ghist <-ghist + ggplot2::scale_x_discrete(limits=xlim)
    ghist <- ghist + ggplot2::ylab("percent (%)")+
      ggplot2::scale_y_continuous(labels = percent)+
      ggplot2::xlab("")+ theme_mp_hist

    gridExtra::grid.arrange(gLine,ghist,ncol=1,nrow=2,heights=c(4,1))

    if (interactive) {
      df <- data.frame(data.agg,freq=data.freq$freq)
      gvisSingleOptionList <- list(pointSize=8,
                                   series="[
                                   {targetAxisIndex:0, type:'line',color:'magenta',pointShape: 'square'},
                                   {targetAxisIndex:1, type:'bars',color:'yellow'}]",
                                   crosshair="{trigger:'both'}",
                                   hAxis.title=xname,
                                   theme="maximized",
                                   title=paste0("Observation analysis on ",xname, " Observed"),
                                   vAxes="{1:{format:'##.#%',maxValue:1}}",
                                   explorer="{ actions: ['dragToZoom', 'rightClickToReset'],keepInBounds: true }",
                                   chartArea="{width:'90%',height:'90%'}",
                                   height=750)

      plot(googleVis::gvisComboChart(df,xvar="x",yvar="y",options=gvisSingleOptionList))

    }
  }
  else{
    data.plot <- data.table::as.data.table(as.data.frame(cbind(x=x,y=y,w=w,by=by),stringsAsFactors=FALSE))
    data.table::setkey(data.plot,x,by)

    data.plot <- data.plot[,lapply(.SD,as.numeric),by=list(x,by),.SDcols=c("y","w")]

    data.agg <- as.data.frame(data.plot[,lapply(.SD,weighted.mean,w=w),by=list(x,by),.SDcols=c("y","w")],row.names=c("xvar","by","weights","observed"))

    data.freq <- as.data.frame(data.plot[,sum(w),by=list(x,by)][,freq:=V1/sum(V1)])

    #line graph
    gLine1 <- ggplot2::ggplot(data=data.agg,aes(x=x,y=y,group=factor(by),colour=factor(by)))+
      ggplot2::geom_line(size=1) + ggplot2::geom_point(size=4,fill="white")
    if("xlim" %in% opts) gLine1 <-gLine1 + xlim(xlim)
    if("ylim" %in% opts) gLine1 <-gLine1 + ylim(ylim)
    gLine1 <- gLine1+ggplot2::xlab("")+ggplot2::ylab(yname)+ ggplot2::ggtitle(paste("Observation Analysis on: ",xname," by ",byname))+
      theme_mp_line
    if(nlevels(as.factor(data.agg$x))>25) gLine1 <- gLine1 + ggplot2::theme(axis.text.x = element_text(angle = 90,hjust=0.5,vjust=0.5))

    #histogram graph
    ghist <- ggplot2::ggplot(data=data.freq,aes(x=x,y=freq,fill=factor(by)))+ ggplot2::geom_histogram(stat="identity",binwidth=1)
    if("xlim" %in% opts) ghist <-ghist + ggplot2::xlim(xlim)
    ghist <- ghist + ggplot2::xlab("")+ ggplot2::ylab("percent (%)")+ ggplot2::scale_y_continuous(labels = percent) + 
      theme_mp_hist+ theme(legend.position="none")

    gridExtra::grid.arrange(gLine1,ghist,ncol=1,nrow=2,heights=c(4,1))
  }
}