eftsung / pygr

Automatically exported from code.google.com/p/pygr
0 stars 0 forks source link

unable to access the sequence object via a saved bound schema attribute :( #56

Closed GoogleCodeExporter closed 8 years ago

GoogleCodeExporter commented 8 years ago
What steps will reproduce the problem?

qing@1[ensembl]$ python -i
Python 2.5.2 (r252:60911, Aug  8 2008, 09:22:44)
[GCC 4.3.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import pygr.Data
>>> pygr.Data.dir('Bio.Annotation.Ensembl')
['Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon.sqltable',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exonTranscript',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript.sqltable']
>>> from pygr import sqlgraph
>>> from pygr import seqdb
>>> conn = pygr.Data.Bio.Server.Ensembl.Ensembldb()
>>> exonTB =
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon.sqltable()
>>> transcriptTB =
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript.sqltable()
>>> genomeName = 'HUMAN.hg18'
>>> genomeResourceID = 'Bio.Seq.Genome.' + genomeName
>>> genomeResourceID
'Bio.Seq.Genome.HUMAN.hg18'
>>> genome = pygr.Data.getResource(genomeResourceID)
>>> genome['chr1']
chr1[0:247249719]
>>> from ensembl import seqregion
>>> dnaTB = sqlgraph.SQLTable('homo_sapiens_core_47_36i.dna',
itemSliceClass=seqdb.SeqDBSlice, attrAlias=dict(seq='sequence'),
itemClass=seqregion.EnsemblDNA, serverInfo=conn)
>>> dnaTB.__doc__ = 'ensembl dna sql table (homo_sapiens_core_47_36i)'
>>>
pygr.Data.addResource('Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.dna.sqlta
ble',
dnaTB)
>>> seqregionTB = sqlgraph.SQLTable('homo_sapiens_core_47_36i.seq_region',
itemClass=sqlgraph.TupleO, serverInfo=conn)
>>> sr = seqregion.SeqRegion(seqregionTB, {17:genome, 4:dnaTB}, {17:'chr',
4:None})
>>> srID = 'Bio.Seq.Ensembl.homo_sapiens_core_47_36i.seq'
>>> sr.__doc__ = 'ensembl seqregion (homo_sapiens_core_47_36i)'
>>> pygr.Data.addResource(srID, sr)
>>> pygr.Data.save()
>>>

qing@1[ensembl]$ python -i
Python 2.5.2 (r252:60911, Aug  8 2008, 09:22:44)
[GCC 4.3.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import pygr.Data
>>> pygr.Data.dir('Bio.Annotation.Ensembl.homo_sapiens_core_47_36i')
['Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.dna.sqltable',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon.sqltable',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exonTranscript',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript.sqltable']
>>> conn = pygr.Data.Bio.Server.Ensembl.Ensembldb()
>>> exonTB =
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon.sqltable()
>>> transcriptTB =
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript.sqltable()
>>> sr = pygr.Data.Bio.Seq.Ensembl.homo_sapiens_core_47_36i.seq()
>>> from pygr import seqdb

>>> exonAnnoDB = seqdb.AnnotationDB(exonTB, sr,
sliceAttrDict=dict(id='seq_region_id', stop='seq_region_end',
orientation='seq_region_strand'))
>>> transcriptAnnoDB =seqdb.AnnotationDB(transcriptTB, sr,
sliceAttrDict=dict(id='seq_region_id', stop='seq_region_end',
orientation='seq_region_strand'))

>>> exonAnnoDB.__doc__ = 'ensembl annotationDB for exon
(homo_sapiens_core_47_36i)'
>>> transcriptAnnoDB.__doc__ = 'ensembl annotationDB for transcript
(homo_sapiens_core_47_36i)'
>>>
pygr.Data.addResource('Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon',
exonAnnoDB)
>>>
pygr.Data.addResource('Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcrip
t',
transcriptAnnoDB)
>>> from pygr import sqlgraph
>>> exon_transcript =
sqlgraph.SQLGraph('homo_sapiens_core_47_36i.exon_transcript',
serverInfo=conn, sourceDB = exonAnnoDB, targetDB=transcriptAnnoDB,
attrAlias=dict(source_id='exon_id', target_id='transcript_id'))
>>> exon_transcript.__doc__='ensembl exonAnnoDB -> transcriptAnnoDB
(homo_sapiens_core_47_36i)'

>>>
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon_transcript =
exon_transcript
>>>
pygr.Data.schema.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon_transcript
= pygr.Data.ManyToManyRelation(exonAnnoDB, transcriptAnnoDB,
bindAttrs=('transcripts', 'exons'))
>>> pygr.Data.save()
>>>

qing@1[ensembl]$ python -i
Python 2.5.2 (r252:60911, Aug  8 2008, 09:22:44)
[GCC 4.3.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import pygr.Data
>>> pygr.Data.dir('Bio.Annotation.Ensembl')
['Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.dna.sqltable',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon.sqltable',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exonTranscript',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon_transcript',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript',
'Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript.sqltable']
>>> exonDB = pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon()

>>> transcriptDB =
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript()
>>> transcript = transcriptDB[1]
>>> exons = transcript.exons
>>> for e in exons:
...     print e.id, e.seq_region_start
...
1 19397
2 14600
3 8131
4 7778
5 7465
6 7096
7 6721
8 6611
9 6470
10 5767
11 5659
12 4863
13 4274

# I didn't spell *sequence* wrong this time!
>>> for e in exons:
...     print e.id, len(e.sequence)
...
1
Traceback (most recent call last):
  File "<stdin>", line 2, in <module>
AttributeError: 'EnsemblRow_homo_sapiens_core_47_36i.exon' object has no
attribute 'sequence'

# An exon object does have a *sequence* attribute!
>>> exon = exonDB[1]
>>> len(exon.sequence)
273
>>> repr(exon)
'annot1[0:273]'

# In addition, I am able to access the sequence attribute of an exon via
# the saved exon_transcript graph.
>>> exon_transcript =
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.exon_transcript()
>>> exons = (~exon_transcript)[transcript]
>>> for e in exons:
...     print e.id, e.seq_region_start, len(e.sequence)
...
1 19397 273
2 14600 155
3 8131 99
4 7778 147
5 7465 141
6 7096 132
7 6721 198
8 6611 18
9 6470 139
10 5767 44
11 5659 106
12 4863 39
13 4274 92
>>>                                                                       

Original issue reported on code.google.com by jqian....@gmail.com on 22 Dec 2008 at 6:09

GoogleCodeExporter commented 8 years ago
I'm not able to reproduce this problem.  Here's what I tried: I combined all the
initial setup steps listed in this report into a single file, issue56.py
(attached), which saves the relevant ensembl table & annotation info into
pygr.Data.  I then followed the steps to reproduce the bug, but didn't encounter
the reported problem:

>>> import pygr.Data
>>> transcriptDB = 
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript()
>>> transcript = transcriptDB[1]
>>> exons = transcript.exons
>>> for e in exons: print e.id, e.seq_region_start
... 
1 19397
2 14600
3 8131
4 7778
5 7465
6 7096
7 6721
8 6611
9 6470
10 5767
11 5659
12 4863
13 4274
>>> for e in exons: print e.id, len(e.sequence)
... 
1 272
2 154
3 98
4 146
5 140
6 131
7 197
8 17
9 138
10 43
11 105
12 38
13 91
>>> for e in exons: print e.id, str(e.sequence)
... 
1
GGAAAGCGGGTCAAGGCGTAGGGCTGGAGGGCAGGGGCGGGCCCTGGGCGTGGGCTGGGGGTCCTGCCCCGGGGCGCACC
CCGGGCGAGGGCTGCCCGGAGGAGCCGAGGTTGGCGGACAGCTTGGCCCTGAGCTTGAGGGGAAGGCAGCGATGGGACAA
AGGACGGAGGTCTAGGAAGAGGGTCTGCAGAGCAGAAAGCACGGGTAGGGGCGGCCTGACGCTCGGAAGACAACGCATGG
GAGCCGTGTGCACGTCGGGAGCTCGGAGTGAG
2
GCACCATGACTCCTGTGAGGATGCAGCACTCCCTGGCAGGTCAGACCTATGCCGTGCCCTTCATCCAGCCAGACCTGCGG
CGAGAGGAGGCCGTCCAGCAGATGGCGGATGCCCTGCAGTACCTGCAGAAGGTCTCTGGAGACATCTTCAGCAG
3
GTAGAGCAGAGCCGGAGCCAGGTGCAGGCCATTGGAGAGAAGGTCTCCTTGGCCCAGGCCAAGATTGAGAAGATCAAGGG
CAGCAAGAAGGCCATCAA
4
GTGTTCTCCAGTGCCAAGTACCCTGCTCCAGGGCGCCTGCAGGAATATGGCTCCATCTTCACGGGCGCCCAGGACCCTGG
CCTGCAGAGACGCCCCCGCCACAGGATCCAGAGCAAGCACCGCCCCCTGGACGAGCGGGCCCTGCA
5
GAGAAGCTGAAGGACTTTCCTGTGTGCGTGAGCACCAAGCCGGAGCCCGAGGACGATGCAGAAGAGGGACTTGGGGGTCT
TCCCAGCAACATCAGCTCTGTCAGCTCCTTGCTGCTCTTCAACACCACCGAGAACCTGTA
6
AAGAAGTATGTCTTCCTGGACCCCCTGGCTGGTGCTGTAACAAAGACCCATGTGATGCTGGGGGCAGAGACAGAGGAGAA
GCTGTTTGATGCCCCCTTGTCCATCAGCAAGAGAGAGCAGCTGGAACAGCA
7
GTCCCAGAGAACTACTTCTATGTGCCAGACCTGGGCCAGGTGCCTGAGATTGATGTTCCATCCTACCTGCCTGACCTGCC
CGGCATTGCCAACGACCTCATGTACATTGCCGACCTGGGCCCCGGCATTGCCCCCTCTGCCCCTGGCACCATTCCAGAAC
TGCCCACCTTCCACACTGAGGTAGCCGAGCCTCTCAA
8 ACCTACAAGATGGGGTa
9
acaccacccccaccgcccccaccaccacccccaGCTCCTGAGGTGCTGGCCAGTGCACCCCCACTCCCACCCTCAACCGC
GGCCCCTGTAGGCCAAGGCGCCAGGCAGGACGACAGCAGCAGCAGCGCGTCTCCTTCA
10 TCCAGGGAGCTCCCAGGGAAGTGGTTGACCCCTCCGGTGGCTG
11
ACTCTGCTAGAGTCCATCCGCCAAGCTGGGGGCATCGGCAAGGCCAAGCTGCGCAGCATGAAGGAGCGAAAGCTGGAGAA
GCAGCAGCAGAAGGAGCAGGAGCAA
12 TGAGAGCCACGAGCCAAGGTGGGCACTTGATGTCGGAT
13
TGCTCCATGGGGGGACGGCTCCACCCAGCCTGCGCCACTGTGTTCTTAAGAGGCTTCCAGAGAAAACGGCACACCAATCA
ATAAAGAACTG
>>> e.__class__
<class 'pygr.classutil.AnnotationSeq_exon'>

One possible difference vs. Jenny's test environment is that I am using my
original ensembl/seqregion.py module.  I also tested using the seqregion.py from
her September pyensembl package, and got the same result.  I don't see how
differences in this file could affect this test result anyway.  But it would
probably be a good idea if Jenny could test the same setup (i.e. rm .pygr_data
to ensure you're starting with a clean slate; run issue56.py to create entries
in .pygr_data; run the steps above to test e.sequence), using her latest
pyensembl seqregion.py or whatever pyensembl version she was using when she
originally encountered the problem.  And of course, please first get the latest
pygr code from the git repository.

I did encounter (and fix) one subtle schema problem.  Titus added code to
AnnotationDB.__init__ that tries to load one annotation, just to make sure that
the user has defined all the database connections correctly (commit ID fe22781).
However, this caused a very subtle problem: it loaded a single object (with key
value 1) into the cache, PRIOR to pygr.Data.ResourceFinder.applySchema()
changing the itemClass to accept schema bindings (like the "exons" attribute).
Therefore this one object lacks the "exons" attribute.  It just so happened that
Jenny also chose transcript[1] as her example, so that this key (and only this
key) would fail to have the "exons" attribute.  I fixed the problem by changing
Titus' code to NOT cache anything as a side-effect of its check.

Original comment by cjlee...@gmail.com on 7 Jan 2009 at 3:35

Attachments:

GoogleCodeExporter commented 8 years ago
Another possible difference between Jenny's test code and my test code is that
she left out a required field (start) from the sliceAttrDict for both exonAnnoDB
and transcriptAnnoDB.  It's possible that leaving out that field caused the
error she reported, since that would indeed screw up its ability to access the
sequence for the exon annotation.

Specifically, she had:
exonAnnoDB = seqdb.AnnotationDB(exonTB, sr,
                                sliceAttrDict=dict(id='seq_region_id',
                                                   stop='seq_region_end',
                                                   orientation='seq_region_strand'))

whereas it should be:
exonAnnoDB = seqdb.AnnotationDB(exonTB, sr,
                                sliceAttrDict=dict(id='seq_region_id',
                                                   start='seq_region_start',
                                                   stop='seq_region_end',
                                                   orientation='seq_region_strand'))

This may very well be the explanation of the error message she reported.  Jenny
could test that by seeing if removing the start entry from the sliceAttrDict in
my issue56.py code will cause the error to reproduce again...  If so, perhaps we
should improve the error message provided in this case; it should specifically
tell the user that the required start attribute was missing.

Original comment by cjlee...@gmail.com on 8 Jan 2009 at 2:29

GoogleCodeExporter commented 8 years ago
I removed the start entry from the sliceAttrDict in Chris's issue56.py as
suggested.  I got the following error message:
Traceback (most recent call last):
  File "issue56_nostart.py", line 36, in <module>
    orientation='seq_region_strand'))
  File "/home/qing/workspace2008/pygr/pygr/seqdb.py", line 490, in __init__
    self.get_annot_obj(k, self.sliceDB[k]) # valid annotation object?
  File "/home/qing/workspace2008/pygr/pygr/seqdb.py", line 527, in get_annot_obj
    start = int(self.getSliceAttr(sliceInfo,'start'))
  File "/home/qing/workspace2008/pygr/pygr/seqdb.py", line 520, in getSliceAttr
    return getattr(sliceInfo,attr) # GET ATTRIBUTE AS USUAL
AttributeError: 'TupleO_homo_sapiens_core_47_36i.exon' object has no attribute 
'start'

WARNING: saving pygr.Data pending data that you forgot to save...
Remember in the future, you must issue the command pygr.Data.save() to save
your pending pygr.Data resources to your resource database(s), or alternatively
pygr.Data.rollback() to dump those pending data without saving them.
It is a very bad idea to rely on this automatic attempt to save your
forgotten data, because it is possible that the Python interpreter
may never call this function at exit (for details see the atexit module
docs in the Python Library Reference).

However, if I explicitly override the default itemClass (TupleO) with
seqregion.EnsemblRow when constructing an sqlgraph.SQLTable object, then
everything is fine again.

exonTB = sqlgraph.SQLTable('homo_sapiens_core_47_36i.exon',
itemClass=seqregion.EnsemblRow, serverInfo=conn)

transcriptTB = sqlgraph.SQLTable('homo_sapiens_core_47_36i.transcript',
itemClass=seqregion.EnsemblRow, serverInfo=conn)

exonAnnoDB = seqdb.AnnotationDB(exonTB, sr,
                                sliceAttrDict=dict(id='seq_region_id',
                                                   stop='seq_region_end',
                                                   orientation='seq_region_strand'))

transcriptAnnoDB =seqdb.AnnotationDB(transcriptTB, sr,
                                     sliceAttrDict=dict(id='seq_region_id',
                                                        stop='seq_region_end',
                                                      orientation='seq_region_strand'))

In this case, I don't need to supply the *start* field when constructing an
AnnotationDB, because the seqregion.EnsemblRow class already defines it as
seq_region_start - 1, *not* seq_region_start.  Therefore, the potential cause of
the bug indicated in Comment 2 is not the real reason.

Note, I did check out the latest code from git before I performed all these
tests.  Maybe something magical in the latest code killed the bug :)

qing@1[ensembl]$ time python issue56_itemClass_nostart.py

real    0m22.228s
user    0m0.838s
sys     0m0.096s

qing@1[ensembl]$ python -i
Python 2.5.2 (r252:60911, Nov 14 2008, 19:46:32)
[GCC 4.3.2] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import pygr.Data
>>> transcriptDB = 
pygr.Data.Bio.Annotation.Ensembl.homo_sapiens_core_47_36i.transcript()
>>> transcript = transcriptDB[1]
>>> exons = transcript.exons
>>> for e in exons: print e.id, e.seq_region_start
...
1 19397
2 14600
3 8131
4 7778
5 7465
6 7096
7 6721
8 6611
9 6470
10 5767
11 5659
12 4863
13 4274
>>> for e in exons: print e.id, len(e.sequence)
...
1 273
2 155
3 99
4 147
5 141
6 132
7 198
8 18
9 139
10 44
11 106
12 39
13 92
>>> for e in exons: print e.id, str(e.sequence)
...
1
GGAAAGCGGGTCAAGGCGTAGGGCTGGAGGGCAGGGGCGGGCCCTGGGCGTGGGCTGGGGGTCCTGCCCCGGGGCGCACC
CCGGGCGAGGGCTGCCCGGAGGAGCCGAGGTTGGCGGACAGCTTGGCCCTGAGCTTGAGGGGAAGGCAGCGATGGGACAA
AGGACGGAGGTCTAGGAAGAGGGTCTGCAGAGCAGAAAGCACGGGTAGGGGCGGCCTGACGCTCGGAAGACAACGCATGG
GAGCCGTGTGCACGTCGGGAGCTCGGAGTGAGC
2
GCACCATGACTCCTGTGAGGATGCAGCACTCCCTGGCAGGTCAGACCTATGCCGTGCCCTTCATCCAGCCAGACCTGCGG
CGAGAGGAGGCCGTCCAGCAGATGGCGGATGCCCTGCAGTACCTGCAGAAGGTCTCTGGAGACATCTTCAGCAGG
3
GTAGAGCAGAGCCGGAGCCAGGTGCAGGCCATTGGAGAGAAGGTCTCCTTGGCCCAGGCCAAGATTGAGAAGATCAAGGG
CAGCAAGAAGGCCATCAAG
4
GTGTTCTCCAGTGCCAAGTACCCTGCTCCAGGGCGCCTGCAGGAATATGGCTCCATCTTCACGGGCGCCCAGGACCCTGG
CCTGCAGAGACGCCCCCGCCACAGGATCCAGAGCAAGCACCGCCCCCTGGACGAGCGGGCCCTGCAG
5
GAGAAGCTGAAGGACTTTCCTGTGTGCGTGAGCACCAAGCCGGAGCCCGAGGACGATGCAGAAGAGGGACTTGGGGGTCT
TCCCAGCAACATCAGCTCTGTCAGCTCCTTGCTGCTCTTCAACACCACCGAGAACCTGTAT
6
AAGAAGTATGTCTTCCTGGACCCCCTGGCTGGTGCTGTAACAAAGACCCATGTGATGCTGGGGGCAGAGACAGAGGAGAA
GCTGTTTGATGCCCCCTTGTCCATCAGCAAGAGAGAGCAGCTGGAACAGCAG
7
GTCCCAGAGAACTACTTCTATGTGCCAGACCTGGGCCAGGTGCCTGAGATTGATGTTCCATCCTACCTGCCTGACCTGCC
CGGCATTGCCAACGACCTCATGTACATTGCCGACCTGGGCCCCGGCATTGCCCCCTCTGCCCCTGGCACCATTCCAGAAC
TGCCCACCTTCCACACTGAGGTAGCCGAGCCTCTCAAG
8 ACCTACAAGATGGGGTac
9
acaccacccccaccgcccccaccaccacccccaGCTCCTGAGGTGCTGGCCAGTGCACCCCCACTCCCACCCTCAACCGC
GGCCCCTGTAGGCCAAGGCGCCAGGCAGGACGACAGCAGCAGCAGCGCGTCTCCTTCAG
10 TCCAGGGAGCTCCCAGGGAAGTGGTTGACCCCTCCGGTGGCTGG
11
ACTCTGCTAGAGTCCATCCGCCAAGCTGGGGGCATCGGCAAGGCCAAGCTGCGCAGCATGAAGGAGCGAAAGCTGGAGAA
GCAGCAGCAGAAGGAGCAGGAGCAAG
12 TGAGAGCCACGAGCCAAGGTGGGCACTTGATGTCGGATC
13
TGCTCCATGGGGGGACGGCTCCACCCAGCCTGCGCCACTGTGTTCTTAAGAGGCTTCCAGAGAAAACGGCACACCAATCA
ATAAAGAACTGA
>>>        

Original comment by jqian....@gmail.com on 8 Jan 2009 at 7:23