Closed ArtPoon closed 2 years ago
Received API documentation, need to write script to automate this transaction
(venv) [poona8@ covizu]$ python3 retrieve.py
[2021-10-14T18:14:24.770663] Caching download in memory
[2021-10-14T18:14:30.265107] Extracting file names in tar
[2021-10-14T18:14:34.606680] Exporting compressed files
[2021-10-14T18:20:23.333927] exporting metadata TSV to gzip-compressed file
[2021-10-14T18:20:24.087510] Wrote outputs to virusseq.2021-10-14T18:14:34.fasta.xz and virusseq.2021-10-14T18:14:34.metadata.tsv.gz
Need to update convert.py:
diff --git a/convert.py b/convert.py
index d186023..9585581 100644
--- a/convert.py
+++ b/convert.py
@@ -49,18 +49,18 @@ def parse_metadata(handle, set_region=None, delimiter=',', callback=None):
'coldate': row[args.coldate],
'region': row.get(args.region, set_region),
'country': row[args.country],
- 'division': row.get(args.division, None)
+ 'division': row.get(args.division, None),
+ 'lineage': row[args.lineage]
}})
return metadata
-def combine(handle, lineage_file, metadata, minlen=29000, mindate='2019-12-01', callback=None):
+def combine(handle, metadata, minlen=29000, mindate='2019-12-01', callback=None):
"""
Combine FASTA and metadata records into dictionary objects and do some basic filtering.
:param handle: file handle, open in read mode to FASTA
- :param lineage_file: file handle, open in read mode to Pangolin CSV output
:param metadata: dict, returned from parse_metadata()
:param minlen: int, minimum genome length
:param mindate: str, earliest sample collection date in ISO-8601 format
@@ -70,17 +70,6 @@ def combine(handle, lineage_file, metadata, minlen=29000, mindate='2019-12-01',
"""
mindate = seq_utils.fromisoformat(mindate)
- # parse CSV output from Pangolin
- reader = DictReader(lineage_file)
- if 'taxon' not in reader.fieldnames or 'lineage' not in reader.fieldnames:
- if callback:
- callback("Lineage CSV header does not match expected.", level='ERROR')
- sys.exit()
-
- lineages = {}
- for row in reader:
- lineages.update({row['taxon']: row['lineage']})
-
rejects = {'short': 0, 'baddate': 0}
for label, seq in seq_utils.iter_fasta(handle):
if label not in metadata:
@@ -102,11 +91,11 @@ def combine(handle, lineage_file, metadata, minlen=29000, mindate='2019-12-01',
rejects['baddate'] += 1
continue # reject records with non-sensical collection date
- lineage = lineages.get(label, None)
+ lineage = metadata[label]['lineage']
if lineage is None:
if callback:
callback(
- "Failed to retrieve lineage assignment for {}".format(header),
+ "Failed to retrieve lineage assignment for {}".format(label),
level='ERROR'
)
sys.exit()
@@ -145,24 +134,27 @@ def parse_args():
"file.")
compression = parser.add_mutually_exclusive_group()
- compression.add_argument("--gzip", action="store_true", help="FASTA is gzip-compressed.")
+ compression.add_argument("--gz", action="store_true", help="FASTA is gzip-compressed.")
compression.add_argument("--xz", action="store_true", help="FASTA is xz-compressed.")
- parser.add_argument("lineages", type=argparse.FileType('r'),
- help="input, CSV output generated by Pangolin")
- parser.add_argument("metadata", type=argparse.FileType('r'),
- help="input, CSV containing metadata")
-
- parser.add_argument("--delimiter", type=unescaped_str, default=',',
- help="delimiter character for metadata CSV; "
- "use '\t' if tab-delimited")
- parser.add_argument("--name", type=str, default="name",
- help="column label for virus sample name in metadata CSV; "
+ parser.add_argument("metadata", type=str,
+ help="input, CSV containing metadata, including Pango lineage")
+ meta_compress = parser.add_mutually_exclusive_group()
+ meta_compress.add_argument("--mgz", action="store_true", help="metadata file is gzip-compressed")
+ meta_compress.add_argument("--mxz", action="store_true", help="metadata file is xz-compressed")
+
+ parser.add_argument("--delimiter", type=unescaped_str, default='\t',
+ help="delimiter character for metadata TSV; "
+ "use ',' if comma-delimited")
+ parser.add_argument("--name", type=str, default="strain",
+ help="column label for virus sample name in metadata; "
"required, must match FASTA")
- parser.add_argument("--accession", type=str, default="accession",
+ parser.add_argument("--accession", type=str, default="genbank_accession",
help="column label for accession number; required")
parser.add_argument("--coldate", type=str, default="date",
help="column label for collection date in metadata CSV; required")
+ parser.add_argument("--lineage", type=str, default="pango_lineage",
+ help="column label for Pango lineage classification; required")
# geographical metadata
parser.add_argument("--region", type=str, default=None,
@@ -176,9 +168,6 @@ def parse_args():
help="column label for country division (e.g., province) in metadata CSV; "
"defaults to None")
- parser.add_argument("--bylineage", type=str, default='data/by_lineage.json',
- help="path to write JSON of features by lineage")
-
parser.add_argument('--minlen', type=int, default=29000, help='minimum genome length (nt)')
parser.add_argument('--mindate', type=str, default='2019-12-01',
help='earliest possible sample collection date (ISO format, default '
@@ -192,14 +181,6 @@ if __name__ == "__main__":
cb = Callback()
callback = cb.callback
- # determine how to handle input FASTA file
- if args.gzip:
- handle = gzip.open(args.infile, 'rt')
- elif args.xz:
- handle = lzma.open(args.infile, 'rt')
- else:
- handle = open(args.infile)
-
# determine how to output results
if args.outfile:
outfile = lzma.open(args.outfile, 'wt')
@@ -210,10 +191,27 @@ if __name__ == "__main__":
sys.stderr.write("Streaming results to standard output, deactivating callback.")
callback = None # deactivate callback
- metadata = parse_metadata(args.metadata, set_region=args.set_region, delimiter=args.delimiter,
+ # process input metadata file
+ if args.mgz:
+ handle = gzip.open(args.metadata, 'rt')
+ elif args.mxz:
+ handle = lzma.open(args.metadata, 'rt')
+ else:
+ handle = open(args.metadata)
+ metadata = parse_metadata(handle, set_region=args.set_region, delimiter=args.delimiter,
callback=callback)
+ handle.close()
- feed = combine(handle, args.lineages, metadata, minlen=args.minlen, mindate=args.mindate,
+ # process input FASTA file
+ if args.gz:
+ handle = gzip.open(args.infile, 'rt')
+ elif args.xz:
+ handle = lzma.open(args.infile, 'rt')
+ else:
+ handle = open(args.infile)
+ feed = combine(handle, metadata, minlen=args.minlen, mindate=args.mindate,
callback=callback)
for record in feed:
outfile.write(json.dumps(record)+'\n')
+ handle.close()
+ outfile.close()
$ python3 convert.py data/sequences.fasta.xz data/metadata.tsv.gz --xz --mgz -o data/converted.json.xz
🏄 [2:05:44.306679] Rejected 4950 short genomes, 14284 records with bad dates
This took two hours! Need to think about reading data into memory for processing instead of writing out a .json.xz
file - re-compressing the data is taking too long.
Comparing processing times and file sizes:
art@orolo:~/git/covizu$ python3 retrieve.py
[2021-10-16T12:32:18.338287] Caching download in memory
[2021-10-16T12:32:45.147105] Extracting file names in tar
[2021-10-16T12:32:49.292037] Exporting compressed files
[2021-10-16T12:35:16.565273] exporting metadata TSV to gzip-compressed file
[2021-10-16T12:35:17.304534] Wrote outputs to virusseq.2021-10-16T12:32:49.fasta.gzip and virusseq.2021-10-16T12:32:49.metadata.tsv.gz
art@orolo:~/git/covizu$ vi retrieve.py
art@orolo:~/git/covizu$ python3 retrieve.py
[2021-10-16T12:36:52.815207] Caching download in memory
[2021-10-16T12:37:19.508726] Extracting file names in tar
[2021-10-16T12:37:23.663227] Exporting compressed files
[2021-10-16T12:42:41.344502] exporting metadata TSV to gzip-compressed file
[2021-10-16T12:42:42.079632] Wrote outputs to virusseq.2021-10-16T12:37:23.fasta.xz and virusseq.2021-10-16T12:37:23.metadata.tsv.gz
art@orolo:~/git/covizu$ ls -lt | head
total 2280168
-rw-rw-r-- 1 art art 2120711 Oct 16 12:42 virusseq.2021-10-16T12:37:23.metadata.tsv.gz
-rw-rw-r-- 1 art art 4455876 Oct 16 12:42 virusseq.2021-10-16T12:37:23.fasta.xz
-rw-rw-r-- 1 art art 4602 Oct 16 12:36 retrieve.py
-rw-rw-r-- 1 art art 2120711 Oct 16 12:35 virusseq.2021-10-16T12:32:49.metadata.tsv.gz
-rw-rw-r-- 1 art art 110933065 Oct 16 12:35 virusseq.2021-10-16T12:32:49.fasta.gzip
FASTA to gzip: 147 seconds, 106 MB. FASTA to xz: 318 seconds, 4.3 MB.
art@orolo:~/git/covizu$ conda activate pangolin; pangolin virusseq.2021-10-16T12:37:23.fasta.xz
This took too long, see #315
(pangolin) art@orolo:~/git/covizu$ time mpirun -np 8 python3 mangolin.py virusseq.2021-10-16T12\:37\:23.fasta.xz
(1/8) processing 1000 sequences
(6/8) processing 1000 sequences
...
(5/8) processing 788 sequences
(0/8) processing 789 sequences
real 5m52.020s
user 47m46.638s
sys 22m12.273s
head -n1 mangolin.0.csv > combined.csv && tail -n+2 -q mangolin.*.csv >> combined.csv
to combine mangolin outputs
Weird bug:
art@orolo:~/git/covizu$ python3 process.py data/virusseq.2021-10-16T12\:37\:23.fasta.xz data/virusseq.2021-10-16T12\:37\:23.pangolin.csv data/virusseq.2021-10-16T12\:37\:23.metadata.tsv.gz data/sequences.fasta.xz data/metadata.tsv.gz
🦠 [0:00:00.528051] Failed to retrieve lineage assignment for hCoV-19/CANADA/ABPHL-13591/2021
Note: the combined.csv from above was moved to data/virusseq.2021-10-16T12\:37\:23.pangolin.csv.
Ok, one of the fields is capitalizing names. Why is that?
art@orolo:~/git/covizu$ unxz -c data/virusseq.2021-10-16T12\:37\:23.fasta.xz | grep ABPHL-13591
>hCoV-19/Canada/ABPHL-13591/2021
art@orolo:~/git/covizu$ gunzip -c data/virusseq.2021-10-16T12\:37\:23.metadata.tsv.gz | grep ABPHL-13591
ABPL-AB AB_61094 Alberta ProvLab North (APLN) Alberta ProvLab North (APLN) 2021-04-14 Canada Alberta Severe acute respiratory syndrome coronavirus 2 hCoV-19/CANADA/ABPHL-13591/2021 hCoV-19/Canada/ABPHL-13591/2021 Diagnostic testing
Fix: use the FASTA header `name` field instead of `isolate` as the sequence name fieldname.
Reran above:
art@orolo:~/git/covizu$ python3 process.py data/virusseq.2021-10-16T12\:37\:23.fasta.xz \
data/virusseq.2021-10-16T12\:37\:23.pangolin.csv data/virusseq.2021-10-16T12\:37\:23.metadata.tsv.gz \
data/sequences.fasta.xz data/metadata.tsv.gz
🏄 [0:00:04.091434] aligned 0 records
🏄 [0:00:06.397315] aligned 1000 records
...
🏄 [0:02:09.199039] aligned 55000 records
Traceback (most recent call last):
File "process.py", line 264, in <module>
analyze_feed(feed, args=args, callback=cb.callback)
File "process.py", line 124, in analyze_feed
by_lineage = sort_by_lineage(filtered, callback=callback)
File "/home/art/git/covizu/covizu/utils/batch_utils.py", line 250, in sort_by_lineage
for i, record in enumerate(records):
File "/home/art/git/covizu/covizu/utils/seq_utils.py", line 452, in filter_problematic
if qp.is_outlier(coldate, ndiffs):
File "/home/art/git/covizu/covizu/utils/seq_utils.py", line 192, in is_outlier
dt = (coldate - self.origin).days
TypeError: unsupported operand type(s) for -: 'NoneType' and 'datetime.date'
(Presumably: fixed the `is_outlier` error above, and renamed `convert.py` to `process.py` — fragment garbled in extraction.)
--- divergence tree saved in nexus format as
data/divergence_tree.nexus
🏄 [0:48:55.420173] Recoding features, compressing variants..
🏄 [0:49:24.866425] start MPI on minor lineages
🏄 [0:00:31.947175][0/1] starting B.1.438.1
Seems like a successful run so far. It's running in serial mode though.
Successful run:
art@orolo:~/git/covizu$ nohup python3 process.py -np 12 --limit 10000 --ft2bin fasttree2-mp \
data/virusseq.2021-10-16T12\:37\:23.fasta.xz \
data/virusseq.2021-10-16T12\:37\:23.pangolin.csv \
data/virusseq.2021-10-16T12\:37\:23.metadata.tsv.gz \
data/sequences.fasta.xz data/metadata.tsv.gz &
art@orolo:~/git/covizu$ tail -f nohup.out
🏄 [0:00:40.645717] aligned 0 records
...
🚧 [0:00:51.604438] Rejected short sequence: USA/GA-CDC-GA-EHC-081C/2020
🏄 [0:00:53.094941] aligned 6000 records
🏄 [0:00:53.854689] aligned 7000 records
🏄 [0:00:53.864199] filtered 2070 problematic features
🏄 [0:00:53.864252] 3896 genomes with excess missing sites
🏄 [0:00:53.864266] 1371 genomes with excess divergence
🏄 [0:00:53.864334] Parsing Pango lineage designations
🏄 [0:00:54.202278] Identifying lineage representative genomes
🏄 [0:00:54.258802] Reconstructing tree with fasttree2-mp
...
🏄 [0:02:49.923310] Recoding features, compressing variants..
🏄 [0:02:50.149360] start MPI on minor lineages
🏄 [0:00:00.732462][0/12] starting B.1.1.7
...
🏄 [0:24:57.395507][0/12] starting AM.4
🏄 [0:27:47.367248] Parsing output files
🏄 [0:28:10.411851] All done!
Okay, time to migrate this to collaborator's system
python3 setup.py install --user
python3 retrieve.py
cd data; bash opendata.sh
module load mpi/openmpi-x86_64
mpirun -np 32 python3 mangolin.py data/virusseq.2021-10-21T19\:59\:06.fasta.xz
head -n1 mangolin.0.csv > combined.csv && tail -n+2 -q mangolin.*.csv >> data/combined.csv
rm mangolin.*.csv
Successful run with:
nohup python3 process.py -np 64 --limit 100000 data/virusseq.2021-10-21T19\:59\:06.fasta.xz data/combined.csv data/virusseq.2021-10-21T19\:59\:06.metadata.tsv.gz data/sequences.fasta.xz data/metadata.tsv.gz &
Next do full run.
Modified pipeline to incorporate Pangolin MPI processing, but this requires the conda environment to be active - hence, CoVizu must also be installed in that environment:
🏄 [0:10:34.872575] Recoding features, compressing variants..
🏄 [0:10:35.073721] start MPI on minor lineages
Traceback (most recent call last):
File "covizu/clustering.py", line 13, in <module>
from covizu.utils.progress_utils import Callback
ModuleNotFoundError: No module named 'covizu'
...
subprocess.CalledProcessError: Command '['mpirun', '-np', '12', 'python3', 'covizu/clustering.py', 'recoded.json', 'minor_lineages.txt', '--mode', 'flat', '--max-variants', '5000', '--nboot', '100', '--outdir', 'data/', '--binpath', 'rapidnj', '--timestamp', '1634940691.412253']' returned non-zero exit status 1.
Can't get this to work even if CoVizu is installed in the `pangolin` environment — calls to `covizu` from subprocess fail as well. Might be able to bypass this by passing `shell=True`, but this is ridiculous. For now, using the workaround of pre-processing with Pangolin.
art@orolo:~/git/covizu$ conda activate pangolin
(pangolin) art@orolo:~/git/covizu$ mpirun -np 8 python3 covizu/utils/pangolin.py data/virusseq.2021-10-16T12\:37\:23.fasta.xz data/virusseq.2021-10-16T12\:37\:23.pango
(5/8) processing 1000 sequences
(3/8) processing 1000 sequences
(pangolin) art@orolo:~/git/covizu$ head -n1 data/virusseq.2021-10-16T12\:37\:23.pango.0.csv > data/virusseq.2021-10-16T12\:37\:23.pango.csv && tail -n+2 -q data/virusseq.2021-10-16T12\:37\:23.pango.*.csv >> data/virusseq.2021-10-16T12\:37\:23.pango.csv
(pangolin) art@orolo:~/git/covizu$ rm data/virusseq.2021-10-16T12\:37\:23.pango.*.csv
(pangolin) art@orolo:~/git/covizu$ conda deactivate
art@orolo:~/git/covizu$ python3 process.py --vsfasta data/virusseq.2021-10-16T12\:37\:23.fasta.xz \
--vspango data/virusseq.2021-10-16T12\:37\:23.pango.csv \
--vsmeta data/virusseq.2021-10-16T12\:37\:23.metadata.tsv.gz \
--opfasta data/sequences.fasta.xz --opmeta data/metadata.tsv.gz --outdir data --ft2bin fasttree2-mp \
--limit 10000 --np 8
Edit: forgot the `-np 8`, so the run was in serial mode.
Successful full run on server:
$ nohup python3 process.py -np 64 --outdir data --vsfasta data/virusseq.2021-10-21T19\:59\:06.fasta.xz \
--vspango data/combined.csv --vsmeta data/virusseq.2021-10-21T19\:59\:06.metadata.tsv.gz \
--opfasta data/sequences.fasta.xz --opmeta data/metadata.tsv.gz &
...
🏄 [3:04:21.081654] start D.2, 9013 entries
🏄 [3:05:04.203964] start B.1.177, 25989 entries
🏄 [3:08:54.529976] Parsing output files
🏄 [3:54:18.225675] All done!
Ok now we just need to automate this
I've generated data files from a full run; need to upload these to the OICR data bucket so they can run some tests with the vsportal front end.
Backend running every three days
Still waiting on VirusSeq portal API documentation