phiweger / zoo

A portable datastructure for rapid prototyping in (viral) bioinformatics (under development).
5 stars 2 forks source link

Bio.SeqRecord to/ from JSON conversion #90

Open phiweger opened 7 years ago

phiweger commented 7 years ago

Current solution is only one-way and hacky. Implement something nicer.

http://lists.open-bio.org/pipermail/biopython/2011-June/013558.html https://github.com/laserson/pytools/blob/master/seqtools.py#L209

convert Bio.SeqRecord to/ from JSON: http://biopython-dev.biopython.narkive.com/1SzbUvzS/json-formatting-of-seqrecord-objects see attachment:

from Bio import AlignIO
from Bio import SeqIO
from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO.Interfaces import SequentialSequenceWriter
import json

def SeqFromJSON(handle, alphabet = single_letter_alphabet, title2ids=None ):
    jsonObj = json.loads( handle.read() )
    if hasattr(jsonObj,'update'):
        # then it's a dict, so we've only one record.
        jsonSeq = jsonObj.pop('seq', '')
        yield SeqRecord( Seq( jsonSeq , alphabet), **jsonObj )
    else:
        # more than one record
        for jsonRecord in jsonObj:
            jsonSeq = jsonRecord.pop('seq', '')
            yield SeqRecord( Seq( jsonSeq , alphabet), **jsonRecord )

class SeqToJSON(SequentialSequenceWriter):
    features = ['type', 'location_operator', 'strand', 'id',
                'ref', 'ref_db', 'qualifiers']
    def convert_features(self, features):
        """features - a list of SeqFeature objects"""
        def _convert_feature( feature ):
            _feature = {}
            if hasattr( feature, 'location' ):
                location = feature.location
                attrs = ('strand', 'start', 'end',)
                location = dict((attr, getattr(location,attr)) for attr in attrs \
                        if hasattr( location, attr ))
                _feature.update( { 'location' : location } )
            for feat in self.features:
                if hasattr( features, feat ):
                    _feature.update( { feat : getattr(feature,feat) } )
            # recursively update subfeatures.
            if hasattr( feature, 'subfeatures' ):
                _feature.update( self.convert_features( \
                            getattr(feature, 'subfeatures' ) ) )
            return _feature

        return map( _convert_feature, features )
    def write_header(self):
        self.handle.write('[')
        self._header_written = True
    def write_footer(self):
        self.handle.write(']')
        self._footer_written = True
    attributes = ['id', 'name', 'description', 'dbxrefs',
                        'annotations', 'letter_annotations' ]
    def write_record(self, record):
        write = self.handle.write
        if self._record_written:
            write(',')
        seq = record.seq
        jsonSeq = {'seq' : seq.tostring() }
        if hasattr( record, 'features' ):
            jsonSeq.update( {'features' : 
                self.convert_features(getattr(record, 'features')) } )
        for attr in self.attributes:
            if hasattr(record, attr):
                jsonSeq.update( { attr : getattr( record, attr )})
        write( json.dumps(jsonSeq) )
        self._record_written = True
    def write_records(self, records):
        self.write_header()
        write_record = self.write_record
        map( write_record, records )
        self.write_footer()
        return len(records)

SeqIO._FormatToIterator.update(   {'json': SeqFromJSON})
AlignIO._FormatToIterator.update( {'json': SeqFromJSON})
SeqIO._FormatToWriter.update(     {'json': SeqToJSON  })
AlignIO._FormatToWriter.update(   {'json': SeqToJSON  })

if __name__ == '__main__':
    import cStringIO as StringIO
    from Bio import SeqIO
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from Bio.Alphabet import generic_dna
    def testwrite():
        """Example of how to write sequences as a JSON
        string.
        """
        seq = SeqRecord( Seq( 'ACGTACGTACGTACGT',alphabet=generic_dna),
                    id = 'ABC123',
                    annotations = { 'species' : 'something or other' },
                    features = [
                        SeqFeature( location=FeatureLocation(start=0,end=3),
                            type='CDS',
                            id='a feature',
                            qualifiers = { 'a qual' : 'result' }
                        ),
                        SeqFeature( location=FeatureLocation(strand=-1,start=3,end=6),
                            type='CDS',
                            id='a feature',
                            qualifiers = { 'a qual' : 'result' }
                        )
                        ] )
        # StringIO used to replicate file-like object.
        #print 'Testing writing a SeqRecord object to json'
        json_seq = StringIO.StringIO()
        json_seq.write( seq.format('json' ) )
        #print 'genbank formatted sequence would look like:-'
        #print seq.format('genbank')
        json_seq.seek(0)
        #print 'json formatted version:-:'
        #print seq.format('json')
        return json_seq
    def testread():
        """ Example of how to read sequences from JSON object."""
        import re
        strip = re.compile('\s+')
        json_seqs = """\
            [{ 
                "id"   :  "ABC123",
                "name" : "nothing",
                "annotations" : { \
                        "source" : "species name"
                    },
                "seq"  : "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"
            },\
            { 
                "id"   :  "BCD234",
                "name" : "nothing",
                "annotations" : { \
                        "source" : "species name 2"
                    },
                "seq"  : "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT"
            }]\
            """
        # strip white space from string and load it into file-like object
        seq_handle = StringIO.StringIO(strip.sub('',json_seqs))
        # convert to a list of SeqRecord objects
        seqs = [seq for seq in SeqIO.parse( seq_handle, 'json' ) ]
        return seqs
    seq_handle = testwrite()
    #print seq_handle.read()
    seqrecords = testread()
    print seqrecords