Closed JNajar closed 5 months ago
Could you reformat the code blocks please?
## calculate LD
for (chr in 1:22) {
# Extract SNPs that are included in the chromosome
ind.chr <- which(info_snp$chr == chr)
ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
# Calculate the LD
corr0 <- snp_cor(
genotype,
ind.col = ind.chr2,
ncores = NCORES,
infos.pos = POS2[ind.chr2],
size = 3 / 1000
)
if (chr == 1) {
ld <- Matrix::colSums(corr0^2)
corr <- as_SFBM(corr0, tmp)
} else {
ld <- c(ld, Matrix::colSums(corr0^2))
corr$add_columns(corr0, nrow(corr))
}
}
There were 14 warnings (use warnings() to see them)
warnings()
Warning messages:
1: NA or NaN values in the resulting correlation matrix.
2: NA or NaN values in the resulting correlation matrix.
3: NA or NaN values in the resulting correlation matrix.
4: NA or NaN values in the resulting correlation matrix.
5: NA or NaN values in the resulting correlation matrix.
6: NA or NaN values in the resulting correlation matrix.
7: NA or NaN values in the resulting correlation matrix.
8: NA or NaN values in the resulting correlation matrix.
9: NA or NaN values in the resulting correlation matrix.
10: NA or NaN values in the resulting correlation matrix.
11: NA or NaN values in the resulting correlation matrix.
12: NA or NaN values in the resulting correlation matrix.
13: NA or NaN values in the resulting correlation matrix.
14: NA or NaN values in the resulting correlation matrix.
##I still tried to go through with the script:
## We assume the fam order is the same across different chromosomes
fam.order <- as.data.table(obj.bigSNP$fam)
## Rename fam order
setnames(fam.order,
c("family.ID", "sample.ID"),
c("FID", "IID"))
# 6. Perform LD score regression
df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUM_ID_")]
ldsc <- snp_ldsc( ld,
length(ld),
chi2 = (df_beta$beta / df_beta$beta_se)^2,
sample_size = df_beta$n_eff,
blocks = NULL)
Error in if (max(abs(pred - pred0)) < 1e-06) break :
missing value where TRUE/FALSE needed ##got this error instead
length(ld)
[1] 1049911
summary(ld)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
1.175 10.654 16.905 21.566 26.312 474.208 9980
## Remove all NAs from the ld
ld2 <- na.omit(ld)
length(ld2)
[1] 1039931
length(ld)
[1] 1049911
## We assume the fam order is the same across different chromosomes
fam.order <- as.data.table(obj.bigSNP$fam)
## Rename fam order
setnames(fam.order,
c("family.ID", "sample.ID"),
c("FID", "IID"))
# 6. Perform LD score regression
df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUM_ID_")]
ldsc <- snp_ldsc( ld2,
length(ld2),
chi2 = (df_beta$beta / df_beta$beta_se)^2,
sample_size = df_beta$n_eff,
blocks = NULL)
Error: Incompatibility between dimensions.
'chi2' and 'ld_score' should have the same length. ## but this did of course not work
str(obj.bigSNP)
List of 3
$ genotypes:Reference class 'FBM.code256' [package "bigstatsr"] with 16 fields
..$ extptr :<externalptr>
..$ extptr_rw :<externalptr>
..$ nrow : int 3981
..$ ncol : int 1054151
..$ type : Named int 1
.. ..- attr(*, "names")= chr "unsigned char"
..$ backingfile : chr "/project/holstegelab/Share/gwas_array/TOPMED/analyses/JennaNajar/H70-study/Xinxin_Caroline/data/genotypes/LDPre"| __truncated__
..$ is_read_only: logi FALSE
..$ address :<externalptr>
..$ address_rw :<externalptr>
..$ bk : chr "/project/holstegelab/Share/gwas_array/TOPMED/analyses/JennaNajar/H70-study/Xinxin_Caroline/data/genotypes/LDPre"| __truncated__
..$ rds : chr "/project/holstegelab/Share/gwas_array/TOPMED/analyses/JennaNajar/H70-study/Xinxin_Caroline/data/genotypes/LDPre"| __truncated__
..$ is_saved : logi TRUE
..$ type_chr : chr "unsigned char"
..$ type_size : int 1
..$ file_size : num 4.2e+09
..$ code256 : num [1:256] 0 1 2 NA NA NA NA NA NA NA ...
..and 26 methods, of which 12 are possibly relevant:
.. add_columns, as.FBM, bm, bm.desc, check_dimensions,
.. check_write_permissions, copy#envRefClass, initialize, initialize#FBM,
.. save, show#envRefClass, show#FBM
$ fam :'data.frame': 3981 obs. of 6 variables:
..$ family.ID : int [1:3981] 0 0 0 0 0 0 0 0 0 0 ...
..$ sample.ID : chr [1:3981] "1234_1234" "1284_1284" "1300_1300" "1345_1345" ...
..$ paternal.ID: int [1:3981] 0 0 0 0 0 0 0 0 0 0 ...
..$ maternal.ID: int [1:3981] 0 0 0 0 0 0 0 0 0 0 ...
..$ sex : int [1:3981] 0 0 0 0 0 0 0 0 0 0 ...
..$ affection : int [1:3981] -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 ...
$ map :'data.frame': 1054151 obs. of 6 variables:
..$ chromosome : int [1:1054151] 1 1 1 1 1 1 1 1 1 1 ...
..$ marker.ID : chr [1:1054151] "rs3131972" "rs3131969" "rs1048488" "rs12562034" ...
..$ genetic.dist: int [1:1054151] 0 0 0 0 0 0 0 0 0 0 ...
..$ physical.pos: int [1:1054151] 752721 754182 760912 768448 779322 838555 846808 853954 854250 864938 ...
..$ allele1 : chr [1:1054151] "G" "G" "T" "A" ...
..$ allele2 : chr [1:1054151] "A" "A" "C" "G" ...
- attr(*, "class")= chr "bigSNP"
> str(pheno)
'data.frame': 3981 obs. of 3 variables:
$ FirstColumn: int 0 0 0 0 0 0 0 0 0 0 ...
$ IID : chr "ADCL1_ADCL1" "ADCL2_ADCL2" "ADCL3_ADCL3" "ADCL4_ADCL4" ...
$ FID : int 0 0 0 0 0 0 0 0 0 0 ...
Just compute snp_MAF(genotype, ind.col = info_snp$`_NUM_ID_`, ncores = NCORES)
and filter out small MAFs from info_snp
.
I'll probably need to add this step in the tuto as well.
Should this step be after perfoming SNP matching?
Like this: sumstats$chr <- as.integer(sumstats$chr) #make chr an integer info_snp <- snp_match(sumstats, map)
snp_MAF(genotype, ind.col = info_snp$_NUM_ID_
, ncores = NCORES) # filter out small MAFs from info_snp.
Or after ld calculations?
Before the loop.
I still get the same errors...
sumstats$chr <- as.integer(sumstats$chr) #make chr an integer info_snp <- snp_match(sumstats, map)
snp_MAF(genotype, ind.col = info_snp$_NUM_ID_
, ncores = NCORES) # filter out small MAFs from info_snp.
dim(info_snp)
write.table(info_snp, "/project/holstegelab/Share/gwas_array/TOPMED/analyses/JennaNajar/H70-study/results/SCZPRS_LDPred2_sumstats.txt", sep = "\t", quote = FALSE, row.names = FALSE, col.names=T)
genotype <- obj.bigSNP$genotypes
CHR <- map$chr POS <- map$pos
POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
for (chr in 1:22) {
ind.chr <- which(info_snp$chr == chr)
ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
# Calculate the LD
corr0 <- snp_cor(
genotype,
ind.col = ind.chr2,
ncores = NCORES,
infos.pos = POS2[ind.chr2],
size = 3 / 1000
)
if (chr == 1) {
ld <- Matrix::colSums(corr0^2)
corr <- as_SFBM(corr0, tmp)
} else {
ld <- c(ld, Matrix::colSums(corr0^2))
corr$add_columns(corr0, nrow(corr))
}
} There were 14 warnings (use warnings() to see them)
fam.order <- as.data.table(obj.bigSNP$fam)
setnames(fam.order, c("family.ID", "sample.ID"), c("FID", "IID"))
df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUMID")]
ldsc <- snp_ldsc( ld, length(ld), chi2 = (df_beta$beta / df_beta$beta_se)^2, sample_size = df_beta$n_eff, blocks = NULL) Error in if (max(abs(pred - pred0)) < 1e-06) break : missing value where TRUE/FALSE needed
You need to use the result from snp_MAF()
to filter info_snp
.
Look at the tutorial; I've just updated it.
Also, please use the code formatting from GitHub, the '<>' button after selecting your code.
Thank you for the tutorial
I used this code before the loop:
ind.row <- rows_along(genotype)
maf <- snp_MAF(genotype, ind.row = ind.row, ind.col = info_snp$_NUM_ID_
, ncores = NCORES)
maf_thr <- 1 / sqrt(length(ind.row)) # threshold privefl likes to use
info_snp <- info_snp[maf > maf_thr, ]
However, i still get NA or NAN values, but now instead of 14 warnings I have 9.
What do you get for
ind <- which(is.na(ld))
length(ind)
summary(snp_MAF (genotype, ind.col = info_snp$`_NUM_ID_`[ind]))
summary(big_counts(genotype, ind.col = info_snp$`_NUM_ID_`[ind])[4, ])
ind <- which(is.na(ld))
length(ind) [1] 6212
summary(snp_MAF (genotype, ind.col = info_snp$_NUM_ID_
[ind]))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.01595 0.12572 0.24221 0.25118 0.38015 0.50000
summary(big_counts(genotype, ind.col = info_snp$_NUM_ID_
[ind])[4, ])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0 0 0
MAFs are fine, and you have absolutely no missing value, so everything should be fine. I don't know what's going on.
If you want to move on, just do corr0@x[is.na(corr0@x)] <- 0
after computing corr0
.
So strange.
The calculation of corr0 is within the loop, so after the loop you mean? If so, when i do that, im still getting
ldsc <- snp_ldsc( ld, length(ld), chi2 = (df_beta$beta / df_beta$beta_se)^2, sample_size = df_beta$n_eff, blocks = NULL) Error in if (max(abs(pred - pred0)) < 1e-06) break : missing value where TRUE/FALSE needed
Within the loop, just after computing corr0
.
It worked! Thank you so much for your help.
It's just a workaround. It would be good to understand why there are still NaNs values.
Do you have an idea of where i could start tracking it down?
Take one NaN value in corr0
, get the two corresponding column indices in genotype
and try to compute cor()
between these two variants.
I am calculating a PRS for sczhizophrenia, using the script found here: https://choishingwan.github.io/PRS-Tutorial/ldpred/ and using the Genome wide bed file option.
This exact script has been used successfully with a different set of genotype data. The genotype data that i am using has been QC and imputed using Sanger imputation service.
When i try to remove NAs from ld, then i cannot calculate ldsc:
I have inluced the info on the IID dataset and the obj.bigSNP
Do you possibly know why i am getting these warnings in the LD calculation? What can i do troubleshoot this?
Kind regards,