netZoo / netZooR

netZooR is a network biology package implemented in R.
https://netzoo.github.io/
GNU General Public License v3.0
100 stars 39 forks source link

Promblem when running ALPACA #295

Closed lijinw closed 11 months ago

lijinw commented 12 months ago

Hi @marouenbg!

I'm currently running Alpaca using the regNet network generated by Panda. However, I've noticed that the communities obtained through Alpaca are not stable. Specifically, I initially obtained 16 communities, but in subsequent runs, I got 15 communities, and the composition of these communities also varied when I re-ran the code.

Is there a method, similar to using 'set.seed,' that can be employed to ensure more stable and consistent results in this case? Your guidance on this matter would be greatly appreciated.

library(netZooR)

Loading required package: igraph

Warning: package 'igraph' was built under R version 4.3.1

Attaching package: 'igraph'

The following objects are masked from 'package:stats':

decompose, spectrum

The following object is masked from 'package:base':

union

Loading required package: reticulate

Warning: package 'reticulate' was built under R version 4.3.1

Loading required package: pandaR

Loading required package: Biobase

Loading required package: BiocGenerics

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:igraph':

normalize, path, union

The following objects are masked from 'package:stats':

IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

anyDuplicated, aperm, append, as.data.frame, basename, cbind,

colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,

get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,

match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,

Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,

table, tapply, union, unique, unsplit, which.max, which.min

Welcome to Bioconductor

Vignettes contain introductory material; view with

'browseVignettes()'. To cite Bioconductor, see

'citation("Biobase")', and for packages 'citation("pkgname")'.

Loading required package: yarn

Setting options('download.file.method.GEOquery'='auto')

Setting options('GEOquery.inmemory.gpl'=FALSE)

Loading required package: matrixcalc

Attaching package: 'matrixcalc'

The following object is masked from 'package:igraph':

%s%

setwd("E:\nBox\Lijin\Adipose_network_study\lijin\Netzoo\alpaca\sub_panda\new") long_regNet.obesity <- read.table("long_regNet.obesity_0906.txt",header=TRUE) dim(long_regNet.obesity) #8940652 3

[1] 8940652 3

head(long_regNet.obesity)

Transcript_name Gene_name Value

1 AHR ENSG00000000003 -0.60414255

2 AIRE ENSG00000000003 -0.04268767

3 ALX1 ENSG00000000003 0.22438224

4 ALX3 ENSG00000000003 0.38837990

5 ALX4 ENSG00000000003 0.28476211

6 AR ENSG00000000003 -0.71595508

long_regNet.lean <- read.table("long_regNet.lean_0906.txt",header=TRUE) dim(long_regNet.lean) #8940652 3

[1] 8940652 3

head(long_regNet.lean)

Transcript_name Gene_name Value

1 AHR ENSG00000000003 -0.7556592

2 AIRE ENSG00000000003 0.1703895

3 ALX1 ENSG00000000003 0.4482405

4 ALX3 ENSG00000000003 0.6122047

5 ALX4 ENSG00000000003 0.5033211

6 AR ENSG00000000003 -0.8505601

data = cbind(long_regNet.lean,long_regNet.obesity[ , 3] ) dim(data)

[1] 8940652 4

colnames(data)[3:4] = c("lean", "obesity") head(data)

Transcript_name Gene_name lean obesity

1 AHR ENSG00000000003 -0.7556592 -0.60414255

2 AIRE ENSG00000000003 0.1703895 -0.04268767

3 ALX1 ENSG00000000003 0.4482405 0.22438224

4 ALX3 ENSG00000000003 0.6122047 0.38837990

5 ALX4 ENSG00000000003 0.5033211 0.28476211

6 AR ENSG00000000003 -0.8505601 -0.71595508

simp.alp <- alpaca(data,NULL,verbose=FALSE)

[1] "Detecting communities in control network..."

Weights detected. Building condor object with weighted edges.

[1] "modularity of projected graph 0.239174499818219"

[1] "Q = 0.239454012016375"

[1] "Q = 0.239454012016375"

[1] "Computing differential modularity matrix..."

[1] "Computing differential modules..."

[1] "Merging 14527 communities"

[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] "Merging 28 communities"

[1] 1

[1] 2

[1] 3

[1] "Merging 15 communities"

[1] 1

[1] "Computing node scores..."

[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

[1] 13

[1] 14

[1] 15

head(simp.alp)

class(simp.alp)

[1] "list"

simp.alp.list1 <- simp.alp[[1]] simp.memb <- as.vector(simp.alp.list1) names(simp.memb) <- names(simp.alp.list1)

dat1 = as.matrix(simp.memb) head(dat1)

[,1]

ALX1_A 1

ALX3_A 1

ALX4_A 1

ARID3A_A 1

ARID5B_A 1

ARX_A 1

dim(dat1) # 14527 1

[1] 14527 1

dat1 = as.data.frame(dat1) dim(dat1)

[1] 14527 1

dat1$names = rownames(dat1) table(dat1$V1)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

4924 1156 3038 667 504 346 354 1004 1108 232 426 174 310 283 1

colnames(dat1)[1] = "community" #This is the previous results

new

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16

4941 1155 3069 683 565 250 358 1022 1054 182 419 138 329 277 84 1

new2

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

4946 1182 3091 656 540 204 343 997 1054 174 449 434 321 135 1

simp.alp.list2 = simp.alp[[2]] head(simp.alp.list2)

ALX1_A ALX3_A ALX4_A ARID3A_A ARID5B_A ARX_A

0.005090088 0.005688390 0.005504105 0.006486973 0.003185671 0.005474239

simp.memb.dat2 <- as.vector(simp.alp.list2) names(simp.memb.dat2) <- names(simp.alp.list2) dat2 = as.matrix(simp.memb.dat2) head(dat2)

[,1]

ALX1_A 0.005090088

ALX3_A 0.005688390

ALX4_A 0.005504105

ARID3A_A 0.006486973

ARID5B_A 0.003185671

ARX_A 0.005474239

dim(dat2) # 14526 1

[1] 14526 1

dat2 = as.data.frame(dat2) dim(dat2)

[1] 14526 1

colnames(dat2)[1] = "modularity" data = merge(dat1, dat2, by = "names") dim(data) #14526 3

[1] 14526 3

head(data)

names community modularity

1 AHR_A 3 0.003070857

2 AIRE_A 1 0.005332563

3 ALX1_A 1 0.005090088

4 ALX3_A 1 0.005688390

5 ALX4_A 1 0.005504105

6 AR_A 14 0.069982247

sessionInfo()

R version 4.3.0 (2023-04-21 ucrt) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 10 x64 (build 19042)

Matrix products: default

locale: [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 LC_MONETARY=English_United States.utf8 [4] LC_NUMERIC=C LC_TIME=English_United States.utf8

time zone: Asia/Singapore tzcode source: internal

attached base packages: [1] stats graphics grDevices utils datasets methods base

other attached packages: [1] netZooR_1.4.0 matrixcalc_1.0-6 yarn_1.26.0 pandaR_1.32.0 Biobase_2.60.0
[6] BiocGenerics_0.46.0 reticulate_1.30 igraph_1.5.0

loaded via a namespace (and not attached): [1] STRINGdb_2.12.1 fs_1.6.2 matrixStats_1.0.0 bitops_1.0-7
[5] httr_1.4.6 RColorBrewer_1.1-3 doParallel_1.0.17 Rgraphviz_2.44.0
[9] repr_1.1.6 tools_4.3.0 doRNG_1.8.6 backports_1.4.1
[13] utf8_1.2.3 R6_2.5.1 vegan_2.6-4 HDF5Array_1.28.1
[17] mgcv_1.8-42 rhdf5filters_1.12.1 permute_0.9-7 uchardet_1.1.1
[21] prettyunits_1.1.1 base64_2.0.1 preprocessCore_1.62.1 cli_3.6.1
[25] penalized_0.9-52 readr_2.1.4 genefilter_1.82.1 askpass_1.1
[29] Rsamtools_2.16.0 pbdZMQ_0.3-9 siggenes_1.74.0 illuminaio_0.42.0
[33] AnnotationForge_1.42.2 scrime_1.3.5 plotrix_3.8-2 limma_3.56.2
[37] rstudioapi_0.14 RSQLite_2.3.1 generics_0.1.3 GOstats_2.66.0
[41] quantro_1.34.0 BiocIO_1.10.0 gtools_3.9.4 distributional_0.3.2
[45] dplyr_1.1.2 GO.db_3.17.0 Matrix_1.6-1 fansi_1.0.4
[49] S4Vectors_0.38.1 abind_1.4-5 lifecycle_1.0.3 yaml_2.3.7
[53] edgeR_3.42.4 SummarizedExperiment_1.30.2 gplots_3.1.3 rhdf5_2.44.0
[57] BiocFileCache_2.8.0 grid_4.3.0 blob_1.2.4 crayon_1.5.2
[61] lattice_0.21-8 GenomicFeatures_1.52.1 annotate_1.78.0 KEGGREST_1.40.0
[65] pillar_1.9.0 knitr_1.43 beanplot_1.3.1 GenomicRanges_1.52.0
[69] rjson_0.2.21 codetools_0.2-19 glue_1.6.2 downloader_0.4
[73] data.table_1.14.8 vctrs_0.6.2 png_0.1-8 gtable_0.3.3
[77] assertthat_0.2.1 gsubfn_0.7 cachem_1.0.8 xfun_0.39
[81] S4Arrays_1.0.4 rsconnect_1.1.0 survival_3.5-5 iterators_1.0.14
[85] nlme_3.1-162 Category_2.66.0 bit64_4.0.5 progress_1.2.2
[89] filelock_1.0.2 GenomeInfoDb_1.36.1 tensorA_0.36.2 nor1mix_1.3-0
[93] KernSmooth_2.23-20 colorspace_2.1-0 DBI_1.1.3 nnet_7.3-18
[97] processx_3.8.2 tidyselect_1.2.0 bit_4.0.5 compiler_4.3.0
[101] curl_5.0.1 chron_2.3-61 graph_1.78.0 xml2_1.3.4
[105] ggdendro_0.1.23 DelayedArray_0.26.3 posterior_1.4.1 rtracklayer_1.60.0
[109] checkmate_2.2.0 scales_1.2.1 caTools_1.18.2 hexbin_1.28.3
[113] quadprog_1.5-8 RBGL_1.76.0 rappdirs_0.3.3 stringr_1.5.0
[117] digest_0.6.31 rmarkdown_2.22 GEOquery_2.68.0 XVector_0.40.0
[121] htmltools_0.5.5 pkgconfig_2.0.3 base64enc_0.1-3 sparseMatrixStats_1.12.1
[125] MatrixGenerics_1.12.2 dbplyr_2.3.2 fastmap_1.1.1 rlang_1.1.1
[129] DelayedMatrixStats_1.22.1 farver_2.1.1 jsonlite_1.8.5 BiocParallel_1.34.2
[133] mclust_6.0.0 RCurl_1.98-1.12 magrittr_2.0.3 GenomeInfoDbData_1.2.10
[137] Rhdf5lib_1.22.0 IRkernel_1.3.2 munsell_0.5.0 Rcpp_1.0.10
[141] proto_1.0.0 sqldf_0.4-11 stringi_1.7.12 RJSONIO_1.3-1.8
[145] zlibbioc_1.46.0 MASS_7.3-58.4 plyr_1.8.8 bumphunter_1.42.0
[149] org.Hs.eg.db_3.17.0 minfi_1.46.0 parallel_4.3.0 Biostrings_2.68.1
[153] IRdisplay_1.1 splines_4.3.0 multtest_2.56.0 hash_2.2.6.2
[157] hms_1.1.3 locfit_1.5-9.8 ps_1.7.5 uuid_1.1-0
[161] RUnit_0.4.32 base64url_1.4 rngtools_1.5.2 reshape2_1.4.4
[165] biomaRt_2.56.1 stats4_4.3.0 XML_3.99-0.14 evaluate_0.21
[169] tzdb_0.4.0 foreach_1.5.2 tidyr_1.3.0 openssl_2.0.6
[173] purrr_1.0.1 reshape_0.8.9 ggplot2_3.4.2 xtable_1.8-4
[177] restfulr_0.0.15 viridisLite_0.4.2 RCy3_2.20.0 tibble_3.2.1
[181] memoise_2.0.1 AnnotationDbi_1.62.1 GenomicAlignments_1.36.0 IRanges_2.34.1
[185] cluster_2.1.4 cmdstanr_0.6.0.9000 GSEABase_1.62.0

marouenbg commented 12 months ago

Hi @meghapadi,

Can you help with this question?

marouenbg commented 11 months ago

@cchen22 Can you please help with this?

cchen22 commented 11 months ago

@lijinw , thank you for your interest in ALPACA! To address your question, ALPACA utilizes the Louvain algorithm for node clustering. It's worth noting that Louvain is considered heuristic, which implies that different community structures can result from various initializations. To mitigate this inherent stochasticity, you can explore alternative options such as CRANE, available in the netZooR package, or employ a consensus clustering algorithm to identify common ALPACA modules across multiple runs.

However, it's important to emphasize that ALPACA itself is designed with reproducibility in mind. This means that if you provide the exact same network datafram (e.g., "net.table" in ALPACA), you should obtain the exact same module structure. In your output, I cannot determine if your two datasets are indeed identical, particularly with regard to the gene and TF order. You can utilize the identical(data1, data2) function to verify this. If it returns true, and you still observe differences in the two ALPACA results, please keep us updated so that we can work together to resolve this issue.

lijinw commented 11 months ago

@marouenbg @cchen22 Thanks for your quick reply! I have redone the whole process. I've used the identical(data1, data2) function to confirm the identity of the input files, but the communities in the output are still different. First run setwd("E:\nBox\Lijin\Adipose_network_study\lijin\Netzoo\alpaca\sub_panda\new") long_regNet.obesity <- read.table("long_regNet.obesity_0906.txt",header=TRUE) dim(long_regNet.obesity) #8940652 3

[1] 8940652 3

head(long_regNet.obesity)

Transcript_name Gene_name Value

1 AHR ENSG00000000003 -0.60414255

2 AIRE ENSG00000000003 -0.04268767

3 ALX1 ENSG00000000003 0.22438224

4 ALX3 ENSG00000000003 0.38837990

5 ALX4 ENSG00000000003 0.28476211

6 AR ENSG00000000003 -0.71595508

long_regNet.lean <- read.table("long_regNet.lean_0906.txt",header=TRUE) dim(long_regNet.lean) #8940652 3

[1] 8940652 3

head(long_regNet.lean)

Transcript_name Gene_name Value

1 AHR ENSG00000000003 -0.7556592

2 AIRE ENSG00000000003 0.1703895

3 ALX1 ENSG00000000003 0.4482405

4 ALX3 ENSG00000000003 0.6122047

5 ALX4 ENSG00000000003 0.5033211

6 AR ENSG00000000003 -0.8505601

data1 = cbind(long_regNet.lean,long_regNet.obesity[ , 3] ) dim(data1)

[1] 8940652 4

colnames(data1)[3:4] = c("lean", "obesity") head(data1)

Transcript_name Gene_name lean obesity

1 AHR ENSG00000000003 -0.7556592 -0.60414255

2 AIRE ENSG00000000003 0.1703895 -0.04268767

3 ALX1 ENSG00000000003 0.4482405 0.22438224

4 ALX3 ENSG00000000003 0.6122047 0.38837990

5 ALX4 ENSG00000000003 0.5033211 0.28476211

6 AR ENSG00000000003 -0.8505601 -0.71595508

simp.alp <- alpaca(data1,NULL,verbose=FALSE)

[1] "Detecting communities in control network..."

Weights detected. Building condor object with weighted edges.

[1] "modularity of projected graph 0.239202871970019"

[1] "Q = 0.239447941941265"

[1] "Q = 0.239471159691995"

[1] "Q = 0.239471159691995"

[1] "Computing differential modularity matrix..."

[1] "Computing differential modules..."

[1] "Merging 14527 communities"

[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

[1] 13

[1] 14

[1] "Merging 28 communities"

[1] 1

[1] 2

[1] 3

[1] "Merging 15 communities"

[1] 1

[1] "Computing node scores..."

[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

[1] 13

[1] 14

[1] 15

head(simp.alp)

class(simp.alp)

[1] "list"

simp.alp.list1 <- simp.alp[[1]] simp.memb <- as.vector(simp.alp.list1) names(simp.memb) <- names(simp.alp.list1)

simp.memb

dat1 = as.matrix(simp.memb) head(dat1)

[,1]

ALX1_A 1

ALX3_A 1

ALX4_A 1

ARID3A_A 1

ARID5B_A 1

ARX_A 1

dim(dat1) # 14527 1

[1] 14527 1

dat1 = as.data.frame(dat1) dim(dat1)

[1] 14527 1

dat1$names = rownames(dat1) table(dat1$V1)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

4960 1169 3046 668 491 313 392 1117 1038 440 297 173 326 96 1

colnames(dat1)[1] = "community" simp.alp.list2 = simp.alp[[2]] head(simp.alp.list2)

ALX1_A ALX3_A ALX4_A ARID3A_A ARID5B_A ARX_A

0.005052630 0.005694581 0.005501466 0.006424355 0.003176428 0.005490759

simp.memb.dat2 <- as.vector(simp.alp.list2) names(simp.memb.dat2) <- names(simp.alp.list2) dat2 = as.matrix(simp.memb.dat2) head(dat2)

[,1]

ALX1_A 0.005052630

ALX3_A 0.005694581

ALX4_A 0.005501466

ARID3A_A 0.006424355

ARID5B_A 0.003176428

ARX_A 0.005490759

dim(dat2) # 14526 1

[1] 14526 1

dat2 = as.data.frame(dat2) dim(dat2)

[1] 14526 1

dat2$names = rownames(dat2) table(dat2$V1) colnames(dat2)[1] = "modularity"

data = merge(dat1, dat2, by = "names")

dim(data) #14526 3

head(data)

Second run library(netZooR) setwd("E:\nBox\Lijin\Adipose_network_study\lijin\Netzoo\alpaca\sub_panda\new") long_regNet.obesity <- read.table("long_regNet.obesity_0906.txt",header=TRUE) dim(long_regNet.obesity) #8940652 3

[1] 8940652 3

head(long_regNet.obesity)

Transcript_name Gene_name Value

1 AHR ENSG00000000003 -0.60414255

2 AIRE ENSG00000000003 -0.04268767

3 ALX1 ENSG00000000003 0.22438224

4 ALX3 ENSG00000000003 0.38837990

5 ALX4 ENSG00000000003 0.28476211

6 AR ENSG00000000003 -0.71595508

long_regNet.lean <- read.table("long_regNet.lean_0906.txt",header=TRUE) dim(long_regNet.lean) #8940652 3

[1] 8940652 3

head(long_regNet.lean)

Transcript_name Gene_name Value

1 AHR ENSG00000000003 -0.7556592

2 AIRE ENSG00000000003 0.1703895

3 ALX1 ENSG00000000003 0.4482405

4 ALX3 ENSG00000000003 0.6122047

5 ALX4 ENSG00000000003 0.5033211

6 AR ENSG00000000003 -0.8505601

data2 = cbind(long_regNet.lean,long_regNet.obesity[ , 3] ) dim(data2)

[1] 8940652 4

colnames(data2)[3:4] = c("lean", "obesity") head(data2)

Transcript_name Gene_name lean obesity

1 AHR ENSG00000000003 -0.7556592 -0.60414255

2 AIRE ENSG00000000003 0.1703895 -0.04268767

3 ALX1 ENSG00000000003 0.4482405 0.22438224

4 ALX3 ENSG00000000003 0.6122047 0.38837990

5 ALX4 ENSG00000000003 0.5033211 0.28476211

6 AR ENSG00000000003 -0.8505601 -0.71595508

simp.alp <- alpaca(data2,NULL,verbose=FALSE)

[1] "Detecting communities in control network..."

Weights detected. Building condor object with weighted edges.

[1] "modularity of projected graph 0.239064805562618"

[1] "Q = 0.239380911921786"

[1] "Q = 0.239380911921786"

[1] "Computing differential modularity matrix..."

[1] "Computing differential modules..."

[1] "Merging 14527 communities"

[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

[1] 13

[1] "Merging 29 communities"

[1] 1

[1] 2

[1] 3

[1] "Merging 18 communities"

[1] 1

[1] "Computing node scores..."

[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

[1] 13

[1] 14

[1] 15

[1] 16

[1] 17

[1] 18

head(simp.alp)

class(simp.alp)

[1] "list"

simp.alp.list1 <- simp.alp[[1]] simp.memb <- as.vector(simp.alp.list1) names(simp.memb) <- names(simp.alp.list1)

simp.memb

dat1 = as.matrix(simp.memb) head(dat1)

[,1]

ALX1_A 1

ALX3_A 1

ALX4_A 1

ARID3A_A 1

ARID5B_A 1

ARX_A 1

dim(dat1) # 14527 1

[1] 14527 1

dat1 = as.data.frame(dat1) dim(dat1)

[1] 14527 1

dat1$names = rownames(dat1) table(dat1$V1)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16

4952 1144 3105 678 559 188 359 960 1065 185 439 143 309 280 75 84

17 18

1 1

colnames(dat1)[1] = "community" simp.alp.list2 = simp.alp[[2]] head(simp.alp.list2)

ALX1_A ALX3_A ALX4_A ARID3A_A ARID5B_A ARX_A

0.004968234 0.005659787 0.005476996 0.006372186 0.003226377 0.005493390

simp.memb.dat2 <- as.vector(simp.alp.list2) names(simp.memb.dat2) <- names(simp.alp.list2) dat2 = as.matrix(simp.memb.dat2) head(dat2)

[,1]

ALX1_A 0.004968234

ALX3_A 0.005659787

ALX4_A 0.005476996

ARID3A_A 0.006372186

ARID5B_A 0.003226377

ARX_A 0.005493390

dim(dat2) # 14526 1

[1] 14525 1

dat2 = as.data.frame(dat2) dim(dat2)

[1] 14525 1

dat2$names = rownames(dat2) table(dat2$V1)

colnames(dat2)[1] = "modularity"

data = merge(dat1, dat2, by = "names")

dim(data) #14526 3

head(data)

``

identical(data1, data2)

[1] TRUE

cchen22 commented 11 months ago

@lijinw , this is a very interesting question. I suspect that this might be related to CONDOR. In your two runs, during the first step (the CONDOR step to find communities in the control network), the modularity scores differ slightly. For instance:

In your 1st run, you have "modularity of projected graph 0.239202871970019," In your 2nd run, you have "modularity of projected graph 0.239064805562618."

Have you tried something like set.seed(123); alpaca(data1,NULL,verbose=FALSE) and set.seed(123); alpaca(data2,NULL,verbose=FALSE) to see if you can obtain the same outputs?

If you can provide a small reproducible example, we could assist you further.

Best, Chen

lijinw commented 11 months ago

@cchen22 After setting the seed, I noticed that the results remained consistent. I decided to go with the number of communities that appeared most frequently. Your assistance has been greatly appreciated. Thank you so much!