Per #15 and ensuing blahblah, here's the unvarnished code.
If it breaks, you get to keep the pieces. :-)
library(GoogleGenomics)
##
## may need to do this interactively:
##
authenticate('client_secrets.json')
##
## the rest should not require user input
##
variant <- getVariants(datasetId = "10473108253681171589", chromosome = "22",
start = 16051400, end = 16051500, oneBasedCoord = FALSE,
fields = NULL, converter = c)
variantVR <- variantsToVRanges(variant, slStyle = 'UCSC')
##
## Replace the middle base of a DNAStringSet with the alternate SNP allele
## (only tested for SNPs where both reference and alternate are 1bp wide!)
## There are so very many ways in which this is horrible...
##
replaceWithAlt <- function(stringSet, position, alt) {
strings <- as.character(stringSet)
substr(strings, position, position) <- as.character(alt)
return(DNAStringSet(strings))
}
##
## Use the BSgenome representation of hg19 to play with the ref/alt sequences:
## One base on each side, what happens when we go from reference to alternate?
## Major issue: how to know what reference assembly the variants come from?
##
library(BSgenome.Hsapiens.UCSC.hg19)
##
## Absent a reference assembly, the above (a guess!) could be very wrong...
## Hence issue #13 in the github repository
##
variantVR$context <- getSeq(Hsapiens, resize(variantVR, 3, fix='center'))
variantVR$altcontext <- replaceWithAlt(variantVR$context, 2, alt(variantVR))
show(variantVR)
##
## what is the consequence (what has typically been "lost" to the reference?)
## just in terms of simple doublets, nothing fancy here
##
refCpG <- length( grep('CG', variantVR$context) ) / length( variantVR )
altCpG <- length( grep('CG', variantVR$altcontext) ) / length( variantVR )
##
## So half of a randomly selected set of 4 variants are "losses" of CpG sites.
## Given the sample size we obviously can't conclude anything, but purging of
## CpG sites from the genome (via spontaneous 5mC -> T deamination) is a steady
## process that has heavily depleted the "average" genome of said doublets.
##
print(paste0('In this sample, ',
(altCpG - refCpG)*100, '% of ref-to-alt changes create a CpG.'))
##
## It is interesting (NOT rigorous evidence of any sort) that a randomly picked
## set of SNPs from the 1KGP participants would happen to illustrate the theme.
Per #15 and ensuing blahblah, here's the unvarnished code.
If it breaks, you get to keep the pieces. :-)