igraph / xdata-igraph

xdata igraph, has been merged into igraph/igraph
GNU General Public License v2.0
18 stars 3 forks source link

Scan statistics #5

Closed gaborcsardi closed 10 years ago

gaborcsardi commented 10 years ago

TODO list on top

US

Gabor, Regarding the enhancement of our "scanstat" code, please find the attached as our working R code that can handle any k (of scank) and more. Please note that when another graph "gp" is used (along with g) as an input, e.g,

local.scan(g,gp,k=1,FUN,...)

it first finds k-neighbors of each vertex v from g, then apply FUN to its induced graph in gp. FYI, we call this case as "them" as opposed to "us" without gp.

In addition to these extension, it also can work on weighted graph.

The file also contains a function called "test.scan" for a sample run, benchmarking, and verification.

I hope you can easily modify your beautiful implementation of "scan1" to incorporate these additional functionalities. Just let us know if you think we need further discussion to clarify some of these points or for anything else.

Thank you for your help! Best,

#' Calculate local scan statistics for each vertex in a graph
#'
#' @param g igraph object or matrix.
#' @param gp igraph object or matrix. It's typically at t=t-1, for "them" statistics.
#' @param k radius of the neighborhood.
#' @param mode one of "in", "out", "total".
#' @param FUN scan function. It can be any vertex stat, e.g., "ecount" or "degree", ...
#' @param weighted whether or not the edges are weighted in locality stat calculation.
#' @return a vector of local scan statistics, one for each vertex in the graph.
#' @export
local.scan <- function(g,gp=NULL,k=1,mode="out",FUN=ecount,weighted=FALSE)
{
    wstat <- function(g,...) {
        A <- get.adjacency(g,attr="weight")
        if (k==0) { # weighted degree
            indeg <- colSums(A)
            outdeg <- rowSums(A)
            if (mode=="in") out <- indeg
            else if (mode=="out") out <- outdeg
            else out <- indeg + outdeg
        } else { # weighted ecount
            out <- sum(A)
            if (mode=="in" | mode=="out") out <- out/2
        }
        return(out)
    }

    if(k<0) stop("Error: k should be a non-negative integer!\n")
    if(k==0) FUN <- degree
    if(weighted) FUN <- wstat

    require(igraph)

    if (is.matrix(g) | is.matrix(gp)) {
        gmode <- ifelse((mode=="out" | mode=="in"),"directed","undirected")
        g <- simplify(graph.adjacency(g,mode=gmode))
        if (!is.null(gp))
            gp <- simplify(graph.adjacency(gp,mode=gmode))
    }

    n <- vcount(g)
    if (is.null(gp)) {
        if (k==0) out <- FUN(g,mode=mode)
        else out <- sapply(graph.neighborhood(g,k,V(g),mode),FUN)
    }
    else { # them
        if (k==0)
            out <- unlist(sapply(1:n,function(x) {
                vid <- unlist(neighborhood(g,k+1,x,mode));
                x.rank <- which(sort(vid)==x);
                FUN(induced.subgraph(gp,vid),mode=mode)[x.rank]}))
        else out <- sapply(V(g),function(x) {
                FUN(induced.subgraph(gp,unlist(neighborhood(g,k,x,mode))))})
    }
    return(out)
}

## for sample run, benchmarking, and verification
test.scan <- function()
{
    require(igraph)

    set.seed(12345)
    n <- 10^3
    p <- 0.1
    g <- erdos.renyi.game(n,p)
    E(g)$weight = sample(ecount(g))
    gp <- erdos.renyi.game(n,p)
    E(gp)$weight = sample(ecount(gp))

    ## us: scan0 == degree
    cat("us: scan0, unweighted: ")
    system.time(s1 <- degree(g))
    system.time(s2 <- local.scan(g,k=0))
    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## PLEASE uncomment below lines once the "scan1" is updated!
    ## us: scan0 == degree, weighted
    cat("us: scan0, weighted: ")
    system.time(s1 <- local.scan(g,k=0,weighted=T))
#    system.time(s2 <- scan1(g,k=0,weighted=T))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## us: scan1
    cat("us: scan1, unweighted: ")
    system.time(s1 <- scan1(g))
    system.time(s2 <- local.scan(g,k=1))
    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## us: scan1, weighted
    cat("us: scan1, weighted: ")
    system.time(s1 <- local.scan(g,k=1,weighted=T))
#    system.time(s2 <- scan1(g,k=1,weighted=T))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## us: scan2
    cat("us: scan2, unweighted: ")
    system.time(s1 <- local.scan(g,k=2))
#    system.time(s2 <- scan1(g,k=2))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## us: scan2, weighted
    cat("us: scan2, weighted: ")
    system.time(s1 <- local.scan(g,k=2,weighted=T))
#    system.time(s2 <- scan1(g,k=2,weighted=T))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ####################################################
    ## them: scan0
    cat("them: scan0, unweighted: ")
    system.time(s1 <- local.scan(g,gp,k=0))
#    system.time(s2 <- scan1(g,gp,k=0))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## them: scan0, weighted
    cat("them: scan0, weighted: ")
    system.time(s1 <- local.scan(g,gp,k=0,weighted=T))
#    system.time(s2 <- scan1(g,gp,k=0,weighted=T))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## them: scan1
    cat("them: scan1, unweighted: ")
    system.time(s1 <- local.scan(g,gp,k=1))
#    system.time(s2 <- scan1(g,gp,k=1))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## them: scan1, weighted
    cat("them: scan1, weighted: ")
    system.time(s1 <- local.scan(g,gp,k=1,weighted=T))
#    system.time(s2 <- scan1(g,gp,k=1,weighted=T))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## them: scan2
    cat("them: scan2, unweighted: ")
    system.time(s1 <- local.scan(g,gp,k=2))
#    system.time(s2 <- scan1(g,gp,k=2))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")

    ## them: scan2, weighted
    cat("them: scan2, weighted: ")
    system.time(s1 <- local.scan(g,gp,k=2,weighted=T))
#    system.time(s2 <- scan1(g,gp,k=2,weighted=T))
#    cat(ifelse(sum(s1-s2)==0,"passed!","failed!"),"\n")
gaborcsardi commented 10 years ago

@youngser : for weighted graphs, FUN is ignored, is this intentional?

gaborcsardi commented 10 years ago

youngser:

Yes, instead "wstat" function is used (included in the code). It's basically "weighted degree" for scan0 and "weighted ecount" for else. Please fix/improve it if necessary.

I wrote it only because I couldn't find those functions in igraph. If you have already them (hopefully as fast as existing "degree" and "ecount", then please use them accordingly.

Hope it makes sense.

gaborcsardi commented 10 years ago

How about these defaults?

local.scan <- function(graph.us, graph.them=NULL, k=1, FUN=NULL,
                       weighted=FALSE, mode=c("out", "in", "all", "total"), ...) {

  if (is.null(FUN)) {
    if (k == 0 && weighted) {
      FUN <- function(g) graph.strength(mode=mode, ...)
    } else if (k == 0 && ! weighted) {
      FUN <- function(g) degree(mode=mode, ...)
    } else if (k > 0 && weighted) {
      FUN <- function(g) sum(E(g)$weight)
    } else { # k > 0 && ! weighted
      FUN <- ecount
    }
  }

Then it is possible to supply an arbitrary function for each case, and we also have reasonable defaults.

gaborcsardi commented 10 years ago

youngser:

Looks good to me!

gaborcsardi commented 10 years ago

Btw. is the local.scan name good, or scanstat is better? I think local.scan is more descriptive, but scanstat sounds better.

gaborcsardi commented 10 years ago

youngser:

I prefer local.scan! To us, scanstat is a global thingy, after taking min or max of local.scan...

HEng Wang:

Hello Gabor,

For this function, i think local.scan is more accurate because the definition of scanstat is the maximum of local.scan(v) over all vertices v.

Best, Heng

Carey:

right. (from spatial statistics, eg) the (global) scan statistic is the max of (local) locality statistics.

gaborcsardi commented 10 years ago

OK, have some faster R code now, as a next step, before moving to C. Before that I'll set up a benchmarking environment, to make it easy to run benchmarks for two git trees against each other, see #6.

gaborcsardi commented 10 years ago

@youngser: your function gives me:

g <- graph.formula( a -+ b +-+ c, a -+c )
E(g)$weight <- 1:4
local.scan(g, k=1, weighted=TRUE)

Why divided by two? This is an error, right? I believe the right answer should be 10, 7, 7.

Thanks.

gaborcsardi commented 10 years ago

I believe so! Yes, it's an error! It should be the undirected case, that is,

    if (mode!="in" & mode!="out") out <- out/2

right?

Thanks,

  • Youngser

You also need to check that the graph is actually directed, because otherwise mode does not apply and is (should be) ignored.

Right!

gaborcsardi commented 10 years ago

I'll close this for now, as I understand everything that was needed, is done. Please open it if you need something else.