benjamin-allevius / scanstatistics

An R package for space-time anomaly detection using scan statistics.
GNU General Public License v3.0
49 stars 10 forks source link

top clusters #5

Open zaima1988 opened 5 years ago

zaima1988 commented 5 years ago

Hi Ben, earlier, I've tried to get top clusters using this syntax top10 <- top_clusters(res, zones, k = 10, overlapping = FALSE) top10

but the result (top10), all clusters have gumble p value = 0, and altough I set overlapping = FALSE, the result is still overlapping. And then when I read your updates to top clusters and documentation, the result of top clusters are different than first syntax and all of MLC p value is 0.01. Beside that, when I use the syntax for show subregion in top10 cluster in Flexible Zones, there was error Error: object of type 'closure' is not subsettable What should I do? Thank you very much

Here the First syntax knn_mat <- coords_to_knn(unique(data[,6:7]), 12) zones <- knn_zones(knn_mat)

t<-length(unique(data$year)) m<-length(unique(data$subregion)) counts<-matrix(data$case,nrow=t, ncol=m) View(counts) population<-matrix(data$population,nrow=t, ncol=m)

res <- scan_pb_poisson(counts = counts, zones = zones, population = population, n_mcsim = 99, max_only = FALSE)

res$MLC

hotspot<-unique(data$id)[res$MLC$locations] hotspot

TOP Cluster

top10 <- top_clusters(res, zones, k = 10, overlapping = FALSE) top10

show subregion in top10 cluster

j=1 clustersubregion<-list() for(i in top10$zone){ clustersubregion[[j]]<-unique(data$id)[zones[[i]]] j<-j+1 } clustersubregion

Second Syntax knn_mat <- coords_to_knn(unique(data[,6:7]), 12) zones <- knn_zones(knn_mat)

t<-length(unique(data$year)) m<-length(unique(data$subregion)) counts<-matrix(data$case,nrow=t, ncol=m)

View(counts)

population<-matrix(data$population,nrow=t, ncol=m)

res <- scan_pb_poisson(counts = counts, zones = zones, population = population, n_mcsim = 99, max_only = FALSE)

res$MLC

hotspot<-unique(data$id)[res$MLC$locations] hotspot

tOP CLUSTER P VALUE

mc_pvalue <- function(observed, replicates) { if (length(replicates) == 0) { return(NULL) } else { f <- Vectorize( function(y) { (1 + sum(replicates > y)) / (1 + length(replicates)) } )

return(f(observed))

} }

gumbel_pvalue <- function(observed, replicates, method = "ML", ...) { if (length(replicates) < 2) { stop("Need at least 2 observations to fit Gumbel distribution.") }

Fit Gumbel distribution to Monte Carlo replicates

gumbel_mu <- NA gumbel_sigma <- NA if (method == "ML") { gum_fit <- gum.fit(replicates, show = FALSE, ...) gumbel_mu <- gum_fit$mle[1] gumbel_sigma <- gum_fit$mle[2] } else { gumbel_sigma <- sqrt(6 var(replicates) / pi^2) gumbel_mu <- mean(replicates) + digamma(1) gumbel_sigma }

pvalue <- pgumbel(observed, gumbel_mu, gumbel_sigma, lower.tail = FALSE)

return(list(pvalue = pvalue, gumbel_mu = gumbel_mu, gumbel_sigma = gumbel_sigma)) }

mtop_clusters <- function(x, zones, k = 10, overlapping = FALSE, gumbel = FALSE, alpha = NULL, ...) { k <- min(k, nrow(x$observed)) if (overlapping) { return(x$observed[seq_len(k), ]) } else { row_idx <- c(1L, integer(k - 1)) seen_locations <- zones[[x$observed[1,]$zone]] n_added <- 1L i <- 2L while (n_added < k && i <= nrow(x$observed)) { zone <- x$observed[i, ]$zone if (zone != x$observed[i-1, ]$zone && length(intersect(seen_locations, zones[[zone]])) == 0) { seen_locations <- c(seen_locations, zones[[zone]]) n_added <- n_added + 1L row_idx[n_added] <- i } i <- i + 1L } res <- x$observed[row_idx[row_idx > 0], ]

if (nrow(x$replicates) > 0) {
  res$MC_pvalue <- mc_pvalue(res$score, x$replicates$score)

  if (gumbel) {
    res$Gumbel_pvalue <- gumbel_pvalue(res$score, 
                                       x$replicates$score)$pvalue
  }
  if (!is.null(alpha) && alpha >= 0 && alpha <= 1) {
    res$critical_value <- quantile(x$replicates$score, 1 - alpha)
  }
}
return(res)

} }

top10 <- mtop_clusters(res, zones, k = 10, overlapping = FALSE, gumbel=FALSE,alpha=0.05) top10

show subregion in top10 cluster

j=1 clustersubregion<-list() for(i in top10$zone){ clustersubregion[[j]]<-unique(data$id)[zones[[i]]] j<-j+1 } clustersubregion