Open marcucio opened 7 years ago
@marcucio Not currently, but it could probably be added easily - want to send a PR?
I actually also have the variant files in the vcf format which will be easier to work with. I could write the parser for it but I need to figure out how to find rsids. Any suggestions on how to find the rsids? This is how the info is formatted:
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CL-25819
1 17365 . C G 298.77 HardFilter AC=1;AF=0.500;AN=2;BaseQRankSum=1.13;ClippingRankSum=-1.271e+00;DP=28;FS=15.204;GQ_MEAN=327.00;MLEAC=1;MLEAF=0.500;MQ=35.82;MQ0=0;MQRankSum=1.56;NCC=0;QD=10.67;ReadPosRankSum=0.408;SOR=4.003 GT:AD:DP:GQ:PL 0/1:18,10:28:99:327,0,568
1 69270 . A G 100.90 HardFilter AC=2;AF=1.00;AN=2;DP=5;FS=0.000;GQ_MEAN=15.00;MLEAC=2;MLEAF=1.00;MQ=25.00;MQ0=0;NCC=0;QD=20.18;SOR=3.611 GT:AD:DP:GQ:PL 1/1:0,5:5:15:129,15,0
1 69511 . A G 805.77 HardFilter AC=2;AF=1.00;AN=2;DP=25;FS=0.000;GQ_MEAN=75.00;MLEAC=2;MLEAF=1.00;MQ=34.76;MQ0=0;NCC=0;QD=32.23;SOR=1.609 GT:AD:DP:GQ:PL 1/1:0,25:25:75:834,75,0
1 866511 . C CCCCT 529.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=0.454;ClippingRankSum=1.03;DP=28;FS=0.000;GQ_MEAN=241.00;MLEAC=1;MLEAF=0.500;MQ=60.39;MQ0=0;MQRankSum=0.866;NCC=0;QD=19.87;ReadPosRankSum=-5.360e-01;SOR=1.721 GT:AD:DP:GQ:PL 0/1:6,14:20:99:558,0,241
1 871334 . G T 389.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=-3.377e+00;ClippingRankSum=-4.400e-02;DP=19;FS=0.000;GQ_MEAN=197.00;MLEAC=1;MLEAF=0.500;MQ=60.00;MQ0=0;MQRankSum=-7.450e-01;NCC=0;QD=20.51;ReadPosRankSum=-4.400e-02;SOR=1.609 GT:AD:DP:GQ:PL 0/1:6,13:19:99:418,0,197
1 874778 . GCCTCCCCAGCCACGGTGAGGACCCACCCTGGCATGATCCCCCTCATCA G 100.77 HardFilter AC=1;AF=0.500;AN=2;BaseQRankSum=0.618;ClippingRankSum=0.134;DP=51;FS=5.358;GQ_MEAN=129.00;MLEAC=1;MLEAF=0.500;MQ=58.96;MQ0=0;MQRankSum=-1.316e+00;NCC=0;QD=0.20;ReadPosRankSum=0.264;SOR=2.206 GT:AD:DP:GQ:PL 0/1:26,5:31:99:129,0,3974
1 876499 . A G 951.77 PASS AC=2;AF=1.00;AN=2;DP=25;FS=0.000;GQ_MEAN=75.00;MLEAC=2;MLEAF=1.00;MQ=60.00;MQ0=0;NCC=0;QD=27.64;SOR=2.215 GT:AD:DP:GQ:PL 1/1:0,25:25:75:980,75,0
1 877715 . C G 1014.77 PASS AC=2;AF=1.00;AN=2;DP=28;FS=0.000;GQ_MEAN=84.00;MLEAC=2;MLEAF=1.00;MQ=60.00;MQ0=0;NCC=0;QD=31.26;SOR=0.997 GT:AD:DP:GQ:PL 1/1:0,28:28:84:1043,84,0
1 877831 . T C 794.77 PASS AC=2;AF=1.00;AN=2;DP=23;FS=0.000;GQ_MEAN=69.00;MLEAC=2;MLEAF=1.00;MQ=60.00;MQ0=0;NCC=0;QD=34.56;SOR=1.179 GT:AD:DP:GQ:PL 1/1:0,23:23:69:823,69,0
1 879317 . C T 835.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=2.91;ClippingRankSum=-6.930e-01;DP=48;FS=6.261;GQ_MEAN=660.00;MLEAC=1;MLEAF=0.500;MQ=60.00;MQ0=0;MQRankSum=-3.100e-
We could pull in the SNP data from the NIH?
ftp://ftp.ncbi.nih.gov/snp/database/organism_data/human_9606 (you can open this in chrome)
Yea, I was looking at that. I was hoping there would be a json file somewhere where I can easily query the SNP data. Maybe that will be a separate project to write a small app that pulls from that site and makes a json file that we can use. Hopefully I can find some time in the next few days to look into that part of it further.
Thanks,
-- Mike Marcucio marcucio.com http://marcucio.com/ mike@marcucio.com mailto:mike@marcucio.com
blog http://blog.getitdoneapp.com/ | twitter https://twitter.com/GetItDoneBlog | facebook https://www.facebook.com/GetItDoneTasks | google+ https://plus.google.com/u/0/b/116698933997097774721/+Marcucio/posts
On Mar 30, 2017, at 10:09 AM, contra notifications@github.com wrote:
We could pull in the SNP data from the NIH?
ftp://ftp.ncbi.nih.gov/snp
— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/genomejs/dna2json/issues/23#issuecomment-290422227, or mute the thread https://github.com/notifications/unsubscribe-auth/AA4JXh_51Cm8C33VGiUFmvQORaPmCjxdks5rq7eogaJpZM4MteBk.
Slightly off-topic, but I also wrote a small app that pulls gene data from snpedia.com:
var http = require('http');
var fs = require('fs');

// Usage: node downloadGeneJSON.js <GENE>
//
// Asks SNPedia (Special:Ask, JSON output) for the SNPs in the given gene and
// saves the response body to ../genes/<GENE>.json. Nothing is downloaded when
// that file already exists.
var geneToDownload = process.argv[2];
if (!geneToDownload) {
  return console.log('Error');
}

// The name is spliced into both a file path and a URL path, so a path
// separator would write outside ../genes/ and alter the query.
if (/[\/\\]/.test(geneToDownload)) {
  return console.log('Error: invalid gene name');
}

console.log('Download: ' + geneToDownload);

var dlPath = '../genes/' + geneToDownload + '.json';
if (fs.existsSync(dlPath)) {
  return console.log('File exists');
}

var url = 'http://www.snpedia.com/index.php/Special:Ask/-5B-5BCategory:Is-20a-20snp-5D-5D-20-5B-5BIn-20gene::' + geneToDownload + '-5D-5D/-3FMax-20Magnitude/-3FChromosome-20position/-3FSummary/-3FRiskGeno/mainlabel%3D/limit%3D1000/prettyprint%3Dtrue/format%3Djson';

// Print why the download failed and make the process exit with a non-zero code.
function reportFailure(reason) {
  console.log('Download failed: ' + reason);
  process.exitCode = 1;
}

var request = http.get(url, function (response) {
  if (response.statusCode !== 200) {
    // Drain the body so the socket is released; an error or redirect page
    // must not be saved as if it were the gene's JSON.
    response.resume();
    return reportFailure('HTTP ' + response.statusCode);
  }

  // Open the output file only once there is a body to store: a file created
  // up front would survive a failed request and make every later run stop at
  // the "File exists" check above.
  var file = fs.createWriteStream(dlPath);
  var failed = false;

  // Stop writing, remove the partial file and report the failure (once).
  function abort(err) {
    if (failed) {
      return;
    }
    failed = true;
    response.unpipe(file);
    file.destroy();
    // Best effort: the file may never have been created, so an unlink error
    // here carries no extra information.
    fs.unlink(dlPath, function () {});
    reportFailure(err.message);
  }

  response.on('error', abort);
  response.on('aborted', function () {
    abort(new Error('connection closed before the response completed'));
  });
  file.on('error', abort);
  response.pipe(file);
});

// DNS failures, refused connections, etc. are emitted on the request object.
request.on('error', function (err) {
  reportFailure(err.message);
});
so we can do:
node downloadGeneJSON.js PCDH19
And this is what we need to do to query the genome:
var gql = require('gql');
var geneData = require('./PCDH19.json');
var geneArr = [];
for (var x in geneData.results) {
geneArr.push(x);
}
(function(exports){
exports.exists = gql.and(geneArr.map(function (id) { return gql.exists(id);}));
exports.url = 'http://www.snpedia.com/index.php/PCDH19';
exports.label = 'PCDH19';
exports.description = 'Gene commonly associated with genetic epilepsy';
})(exports);
Maybe I will see if I can make something similar to download and format the SNP data into JSON.
@marcucio It would be really cool to have a repo with the files and publish it as a node module - similar to this kind of a setup: https://github.com/contra/boundaries
Or if we could put it all in one JSON file and publish it, this repo could lean on it as a dependency for the parser.
BTW - that's really cool for pulling from SNPedia. I feel like we could do something interesting there, either one module for all genes or one for each (tedious to maintain).
I was thinking one repo for all genes/genosets. My idea was to make an app that would report on all known genes/genosets found and any known data about them. Maybe the report will even link to sites like this: https://ghr.nlm.nih.gov/gene/KCNQ2 or this: http://diseases.jensenlab.org/Search?query=KCNQ2
Having it as a separate repo that contains all genes would make sense in my case, so that I will only have to import one project and it will update to the latest info any time I run "npm update".
Does this library support the FASTQ format?
https://en.wikipedia.org/wiki/FASTQ_format