trinker / qdap

Quantitative Discourse Analysis Package: Bridging the gap between qualitative data and quantitative analysis
http://cran.us.r-project.org/web/packages/qdap/index.html
175 stars 44 forks source link

Add distance functions for distance between codes #72

Closed trinker closed 12 years ago

trinker commented 12 years ago

Work in progress:

# library(devtools)
# install_github("qdap", "trinker")
p_load(qdap)

head(LIST[[1]])
x <- LIST[[1]][, 6]
y <- LIST[[1]][, 8]
proximaty <- function(x, y) {
    act <- which(x != 0)
    len <- length(act)
    comp <- which(y != 0)
    nums <- sapply(list(x, y), is.numeric)
    if (sum(nums) != 2) {
        stop("all variables must be numeric")
    }    
    if (identical(act, integer(0)) | identical(comp, integer(0))) {
        return(0)
    }
    lapply(1:len, function(i) {
            ming <- act[i] - comp
            min(abs(ming))
        }
    )
}

head(LIST[[1]])

cm_describe <- function(code, grouping.var = NULL) {
    if (!is.null(grouping.var)) {
        if (is.list(grouping.var)) {
          m <- unlist(as.character(substitute(grouping.var))[-1])
          m <- sapply(strsplit(m, "$", fixed = TRUE), 
                      function(x) x[length(x)])
          NAME <- paste(m, collapse = "&")
        } else {
          G <- as.character(substitute(grouping.var))
          NAME <- G[length(G)]
        }
        cname <- strsplit(as.character(substitute(code)), "&")
        NAME <- paste0(cname[[length(cname)]], "&", NAME)
        group.var <- if (is.list(grouping.var) & length(grouping.var)>1) {
          apply(data.frame(grouping.var), 1, function(x){
            if (any(is.na(x))){
              NA
            } else {
              paste(x, collapse = ".")
            }
          }
          )
        } else {
          grouping.var
        }
        v <- do.call(data.frame, rle(paste2(list(code, group.var))))
    } else {
        v <- do.call(data.frame, rle(code))
    }
    v$end<- cumsum(v[, 1])
    colnames(v)[1] <- "duration"
    v$start <- c(0, c(v$end)[-c(length(v$end))])
    v$center <- (v$start + v$end)/2
    v2 <- v[, c("values", "center", "duration", "start", "end")]
    if (!is.null(grouping.var)) {
        nl <- if (is.list(grouping.var)) {
            grouping.var 
        } else { 
            list(grouping.var)
        }
        L2 <- lapply(1:(length(nl) + 1), function(i) {
            x <- strsplit(as.character(v2[, "values"]), "\\.")
            sapply(1:length(x), function(j)x[[j]][i])
            }
        )
        v3 <- data.frame(do.call(cbind, L2))
        colnames(v3) <- unlist(strsplit(NAME, "\\&"))
        v2 <- data.frame(v3, v2[, -1, drop=FALSE])      
    } else {
        cname <- strsplit(as.character(substitute(code)), "&")
        colnames(v2)[1] <- cname[[length(cname)]]
    }
    return(v2)
}

#paste with a grouping variable

rle(c("a", "a", "b", "c", "a"), list()
with(LIST[[1]], cm_describe(TSN, list(tchr, time)))
cm_describe(LIST[[1]]$TSN, LIST[[1]]$person)
cm_describe(LIST[[1]]$TSN)
colsplit2df(with(LIST[[1]], cm_describe(TSN, person)))

#next is to make a distance function
#may not be able to supply the grouping vars to cm_describe directly
cm_bidist <- function(code_x, code_y, grouping.var = NULL) {
    x <- cm_describe(code_x, grouping.var)
    x <- x[as.numeric(as.character(x[, "code_x"])) > 0, ]
    y <- cm_describe(code_y, grouping.var)
    y <- y[as.numeric(as.character(y[, "code_y"])) > 0, ]
    Dnc <- sapply(1:nrow(x), function(i) {
            yind <- 1:nrow(y)
            if (sum(y[, "start"] >= x[i, "start"] & y[, "start"] <= x[i, "end"]) > 0 |
                sum(y[, "end"] >= x[i, "start"] & y[, "end"] <= x[i, "start"]) > 0 |
                sum(sapply(yind, function(j) {
                        y[j, "start"] < x[i, "start"] & y[j, "end"] > x[i, "end"]
                    }
                )) > 0) {
                return(0)
            }
            sdif <- c(y[, "start"], y[, "end"]) - x[i, "start"]
            edif <- c(y[, "start"], y[, "end"]) - x[i, "end"]
            min(abs(c(sdif, edif)))
        }
    )
    Dc <- sapply(1:nrow(x), function(i) {
        FUN <- function(xstart, xend, ystart){
             max(0, min(ystart[ystart > xstart] - xend))
        }
        suppressWarnings(FUN(x[i, "start"], x[i, "end"], y[, "start"]))
        }
    )
    Dc[is.infinite(Dc)] <- NA
    list(associated_distance = Dnc, mean.sd_assoc_dist = c(mean(Dnc), sd(Dnc)), 
        causal_distance = Dc, mean.sd_causal_dist = c(mean(na.omit(Dc)), sd(na.omit(Dc))))
}

c(3, 4, 6) - c(1, 3)
with(LIST[[1]], cm_bidist(TSN, MOTSN, list(tchr, time)))
with(LIST[[1]], cm_bidist(MOTSN, TSN, list(tchr, time)))

#outers the above 
#may have to use diags to put 0s in diagonals
cm_distance <- function(..., grouping.var = NULL, causal = FALSE)

head(LIST[[1]], 400)

str(LIST[[1]])

#Dason replaced this line
#=========================================
#             if (sum(y[, "start"] > x[i, "start"] & y[, "start"] <= x[i, "end"]) > 0) {
#                 return(0)
#             }
#             inds <- y[, "start"] > x[i, "end"]
#             vals <- y[inds, "start"] - x[i, "end"]
#             if (sum(vals[vals >= 0]) == 0) {
#                 return(NA)  #there should be a 
#             } else {
#                 min(vals[vals > 0])
#             }
         }

with(LIST[[1]], cm_bidist(TSN, MOTSN))

set.seed(12)
fart <- rbinom(50, 1, .5)
burp <- rbinom(50, 1, .3)

cm_bidist(fart, burp)
cm_bidist(burp, fart)

a <- cm_describe(fart)
b <- cm_describe(burp)

a <- a[a[, 1]==1,]
b <- b[b[, 1]==1,]

dat <- data.frame(obs=rep(c('fart', 'burp'), c(nrow(a), nrow(b))), 
    rbind(a[, -1], b[, -1]))
rownames(dat) <- NULL
gantt_wrap(dat[, -2], obs,  x.ticks=T, rm.horiz.lines = F, 
    minor.line.freq = 1, major.line.freq = 5, sig.dig.line.freq=1)
trinker commented 12 years ago

Right now v.outer works with matrix data.frame or unequal length list.

p_load(qdap)

proximaty <- function(x, y) {
    act <- which(x != 0)
    len <- length(act)
    comp <- which(y != 0)
    nums <- sapply(list(x, y), is.numeric)
    if (sum(nums) != 2) {
        stop("all variables must be numeric")
    }    
    if (identical(act, integer(0)) | identical(comp, integer(0))) {
        return(0)
    }
    lapply(1:len, function(i) {
            ming <- act[i] - comp
            min(abs(ming))
        }
    )
}

cm_describe <- function(code, grouping.var = NULL) {
    if (!is.null(grouping.var)) {
        if (is.list(grouping.var)) {
          m <- unlist(as.character(substitute(grouping.var))[-1])
          m <- sapply(strsplit(m, "$", fixed = TRUE), 
                      function(x) x[length(x)])
          NAME <- paste(m, collapse = "&")
        } else {
          G <- as.character(substitute(grouping.var))
          NAME <- G[length(G)]
        }
        cname <- strsplit(as.character(substitute(code)), "&")
        NAME <- paste0(cname[[length(cname)]], "&", NAME)
        group.var <- if (is.list(grouping.var) & length(grouping.var)>1) {
          apply(data.frame(grouping.var), 1, function(x){
            if (any(is.na(x))){
              NA
            } else {
              paste(x, collapse = ".")
            }
          }
          )
        } else {
          grouping.var
        }
        v <- do.call(data.frame, rle(paste2(list(code, group.var))))
    } else {
        v <- do.call(data.frame, rle(code))
    }
    v$end<- cumsum(v[, 1])
    colnames(v)[1] <- "duration"
    v$start <- c(0, c(v$end)[-c(length(v$end))])
    v$center <- (v$start + v$end)/2
    v2 <- v[, c("values", "center", "duration", "start", "end")]
    if (!is.null(grouping.var)) {
        nl <- if (is.list(grouping.var)) {
            grouping.var 
        } else { 
            list(grouping.var)
        }
        L2 <- lapply(1:(length(nl) + 1), function(i) {
            x <- strsplit(as.character(v2[, "values"]), "\\.")
            sapply(1:length(x), function(j)x[[j]][i])
            }
        )
        v3 <- data.frame(do.call(cbind, L2))
        colnames(v3) <- unlist(strsplit(NAME, "\\&"))
        v2 <- data.frame(v3, v2[, -1, drop=FALSE])      
    } else {
        cname <- strsplit(as.character(substitute(code)), "&")
        colnames(v2)[1] <- cname[[length(cname)]]
    }
    return(v2)
}

#next is to make a distance function
#may not be able to supply the grouping vars to cm_describe directly
cm_bidist <- function(code_x, code_y, grouping.var = NULL) {
    x <- cm_describe(code_x, grouping.var)
    x <- x[as.numeric(as.character(x[, "code_x"])) > 0, ]
    y <- cm_describe(code_y, grouping.var)
    y <- y[as.numeric(as.character(y[, "code_y"])) > 0, ]
    Dnc <- sapply(1:nrow(x), function(i) {
            yind <- 1:nrow(y)
            if (sum(y[, "start"] >= x[i, "start"] & y[, "start"] <= x[i, "end"]) > 0 |
                sum(y[, "end"] >= x[i, "start"] & y[, "end"] <= x[i, "start"]) > 0 |
                sum(sapply(yind, function(j) {
                        y[j, "start"] < x[i, "start"] & y[j, "end"] > x[i, "end"]
                    }
                )) > 0) {
                return(0)
            }
            sdif <- c(y[, "start"], y[, "end"]) - x[i, "start"]
            edif <- c(y[, "start"], y[, "end"]) - x[i, "end"]
            min(abs(c(sdif, edif)))
        }
    )
    Dc <- sapply(1:nrow(x), function(i) {
        FUN <- function(xstart, xend, ystart){
             max(0, min(ystart[ystart > xstart] - xend))
        }
        suppressWarnings(FUN(x[i, "start"], x[i, "end"], y[, "start"]))
        }
    )
    Dc[is.infinite(Dc)] <- NA
    list(associated_distance = Dnc, mean.sd_assoc_dist = c(mean(Dnc), sd(Dnc)), 
        causal_distance = Dc, mean.sd_causal_dist = c(mean(na.omit(Dc)), sd(na.omit(Dc))))
}

#outers the above 
#may have to use diags to put 0s in diagonals
#cm_distance <- function(..., grouping.var = NULL, causal = FALSE)

set.seed(12)
fart <- rbinom(50, 1, .5)
burp <- rbinom(50, 1, .3)
sneeze <- rbinom(50, 1, .5)
hiccup <- rbinom(50, 1, .3)

cm_bidist(fart, burp)
cm_bidist(burp, fart)[[2]][1]

L2 <- data.frame(fart, burp, sneeze, hiccup)
v.outer(L2, cdm1)

L <- list(fart, burp, sneeze, hiccup)  #make it work with lists
names(L) <- qcv(fart, burp, sneeze, hiccup)
v.outer(L, cdm1)

cdm1m <- function(x, y) cm_bidist(x, y)[[2]][1]
v.outer(L2, cdm1m)

cdm1s <- function(x, y) cm_bidist(x, y)[[2]][2]
v.outer(L2, cdm1s)

cdm1cm <- function(x, y) cm_bidist(x, y)[[4]][1]
v.outer(L2, cdm1cm)

cdm1cs <- function(x, y) cm_bidist(x, y)[[4]][2]
v.outer(L2, cdm1cs)
trinker commented 12 years ago

Uploaded cm_bidist, cm_describe and proximity: Here's the latest script:

p_load(qdap)

set.seed(12)
fart <- rbinom(50, 1, .5)
burp <- rbinom(50, 1, .3)
sneeze <- rbinom(50, 1, .5)
hiccup <- rbinom(50, 1, .3)

cm_bidist(fart, burp)
cm_bidist(burp, fart)[[2]][1]

cdm1 <- function(x, y) cm_bidist(x, y)[[2]][1]
(L2 <- data.frame(fart, burp, sneeze, hiccup))
v.outer(L2, cdm1)

(L <- list(fart, burp, sneeze, hiccup)  #make it work with lists)
names(L) <- qcv(fart, burp, sneeze, hiccup)
v.outer(L, cdm1)

cdm1m <- function(x, y) cm_bidist(x, y)[[2]][1]
v.outer(L2, cdm1m)

cdm1s <- function(x, y) cm_bidist(x, y)[[2]][2]
v.outer(L2, cdm1s)

cdm1cm <- function(x, y) cm_bidist(x, y)[[4]][1]
v.outer(L2, cdm1cm)

cdm1cs <- function(x, y) cm_bidist(x, y)[[4]][2]
v.outer(L2, cdm1cs)

#the start:
cm_distance <- function(..., grouping.var = NULL, causal = FALSE, digits = 3)

#has to grab the start and end columns and create vectors that can be fed to bidist
#create a list of mean, sd and combined and a class of cm.distance, print the combined
trinker commented 12 years ago

Create a function that takes two columns (start and end times) and makes a dummy coded vector representing this:

cm_se2vect <- function(start, end, n.words = NULL) {
    dat <- data.frame(start =start, end = end)
    if (is.null(n)){
        n <- max(dat$end) - 1
    }
    x <- rep(0, n)
    x[unlist(lapply(1:nrow(dat), function(i) dat$start[i]:(dat$end[i]-1)))] <- 1
    x
}

dat <- data.frame(start=c(2, 7, 12, 15), end=c(3, 10, 13, 17))
cm_se2vect(dat$start, dat$end)
trinker commented 12 years ago

put in a safety for if start and end are passed that increase and decrease that throws up a stop

trinker commented 12 years ago

Current development of cm_distance function:

p_load(qdap)

 foo <- list(
     AA = qcv(terms='1'),
     BB = qcv(terms='1:2, 3:10'),
     CC = qcv(terms='1:9, 100:150')
 )

 foo2  <- list(
     AA = qcv(terms='40'),
     BB = qcv(terms='50:90'),
     CC = qcv(terms='60:90, 100:120, 150'),
     DD = qcv(terms='')
 )
 dat <- cm_range2long(foo, foo2, v.name = "time")

cm_distance <- function(dataframe, time.var = NULL, code.var = "code",
    causal = FALSE, start.var = "start", end.var = "end", digits = 3) {
    if (!is.null(time.var)) {
        L1 <- split(dataframe, dataframe[, time.var])
    } else {
        L1 <- dataframe
    }
    L2 <- lapply(L1, function(x) split(x, x[, code.var]))   
    NMS <- lapply(L2, names)
    NMS <- unlist(lapply(seq_along(NMS), function(i) {
        paste(names(NMS)[i], NMS[[i]], sep=".")
    }))
    lens <- sapply(L2, length)
    mlens <- sapply(L2, function(x) {
        max(do.call(rbind, x)[, end.var])
    })
    nt <- rep(mlens, lens)
    v <- unlist(L2, recursive=FALSE)
    L3 <- lapply(seq_along(v), function(i) {
        cm_se2vect(v[[i]][, start.var], v[[i]][, end.var])
    })
    L4 <- lapply(seq_along(L3), function(i){
        c(L3[[i]], rep(0, nt[i] - length(L3[[i]])))
    })
    dat <- do.call(cbind, L4)  
    colnames(dat) <- NMS
    #various parmeters for casual here (sd, mean, pasted, and mean/sd provided
    #make a print method (mean/sd)
    cdm1 <- function(x, y) cm_bidist(x, y)[[2]][1]
    v.outer(dat, cdm1, digits = digits)

    #lapply(L1, )
}

cm_distance(dat, time.var = "time")
trinker commented 12 years ago

finished