ponnhide / pyCircos

python Circos
GNU General Public License v3.0
339 stars 66 forks source link

tickplot/barplot not respecting genome coordinates or value #21

Closed anfoss closed 1 year ago

anfoss commented 2 years ago

I have a viral genome for which I want a circos plot with the genes as outer circle, CDS (+/-) and an histogram/barplot for counts of novel orfs.

Looking at the data format in the example repo

seems that something like this should suffice

gene start end value novel
gene1 163 553 0.3 novel
gene2 516 1101 0.5 reported
gene3 1581 2103 0.1 novel

However, when I use this table to generate the figure:


Garc  = pycircos.Garc
    Gcircle = pycircos.Gcircle
    circle = Gcircle(figsize=(8,8)) 
    for gn in set(tots['gene']):
        subs = tots[tots['gene']==gn]        
        arc    = Garc(arc_id=gn, size=int(subs['end']), interspace=2, raxis_range=(935,985), labelposition=80, label_visible=False)
        circle.add_garc(arc) 
    circle.set_garcs(-65, 245)
    for arc_id in circle.garc_dict:
        circle.tickplot(arc_id, raxis_range=(985,1000), tickinterval=100000000, ticklabels=None) 
    arcdata_dict = collections.defaultdict(dict)
    tots['start'] = tots['start'].astype(int)-1
    for gn in set(tots['gene']):
        subs = tots[tots['gene']==gn]
        arcdata_dict[gn]['positions'] = list(subs['start'])
        arcdata_dict[gn]["widths"] = list(
            subs['start'].astype(int) - subs['end'].astype(int))
        arcdata_dict[gn]["values"] = list(subs['value'].astype(float))
        print(list(subs['value'].astype(float)))
    values_all = list(tots['value'])
    vmin, vmax = min(values_all), max(values_all)
      cols = ['white' if 'reported' in x else 'purple' for x in tots['novel']]
    for key in arcdata_dict:
        circle.barplot(key, data=arcdata_dict[key]["values"], positions=arcdata_dict[key]["positions"],
                        width=arcdata_dict[key]["widths"], base_value=0.0, rlim=[vmin-0.05*abs(vmin), vmax+0.05*abs(vmax)],
                        raxis_range=[500, 745], facecolor=cols, spine=True)

If I do

    for key in arcdata_dict:
        print(arcdata_dict[key]["values"], arcdata_dict[key]["widths"])

The various bars have different width and different heights.

see results here test.pdf

using a GB file as input as the tutorial #2

If I use a gb file as Garc the barplot is also not correct

    f    Garc = pycircos.Garc
    Gcircle = pycircos.Gcircle
    record = SeqIO.read(base, format="genbank")
    garc = Garc(arc_id=id, record=record, interspace=0, linewidth=0,
                facecolor="#FFFFFF00", raxis_range=(0, 10),
                label="my org", label_visible=True)
    gcircle = Gcircle()
    gcircle.add_garc(garc)
    gcircle.set_garcs()

    plus_CDS = []
    minus_CDS = []
    for feat in garc.record.features:
        if feat.type == "CDS" and feat.strand >= 0:
            plus_CDS.append(feat)
        elif feat.strand == -1:
            minus_CDS.append(feat)
    gcircle.featureplot(id, source=plus_CDS,
                        raxis_range=(700, 750), facecolor="tomato")
    gcircle.featureplot(id, source=minus_CDS,
                        raxis_range=(750, 800), facecolor="cornflowerblue")

    arcdata_dict = collections.defaultdict(dict)
    tots['start'] = tots['start'].astype(int)-1
    arcdata_dict[id]['positions'] = list(tots['start'])
    arcdata_dict[id]["widths"] = list(
        tots['start'].astype(int) - tots['end'].astype(int))
    arcdata_dict[id]["values"] = list(tots['value'].astype(float))
    values_all = list(tots['value'])

   # highlight novel ORFs
    cols = ['white' if 'reported' in x else 'purple' for x in tots['novel']]

    vmin, vmax = min(values_all), max(values_all)
    for key in arcdata_dict:
        gcircle.barplot(key, data=arcdata_dict[key]["values"], positions=arcdata_dict[key]["positions"],
                        width=arcdata_dict[key]["widths"], base_value=0.0, rlim=[vmin-0.05*abs(vmin), vmax+0.05*abs(vmax)],
                        raxis_range=[500, 745], facecolor=cols, spine=True)

Zooming in the barplot, there are black bars but no purple bars (while the CDS distribution looks correct).

test_CDS.pdf

    print(len([x for x in cols if x=='purple']))
   # print 4

Am I missing something fundamental here?

The input file is build using the same GenBank file so I am not sure why there are missing regions in the tick plot.