zktuong / dandelion

dandelion - A single cell BCR/TCR V(D)J-seq analysis package for 10X Chromium 5' data
https://sc-dandelion.readthedocs.io/
GNU Affero General Public License v3.0
110 stars 25 forks source link

running without filtered h5 file #77

Closed avilella closed 3 years ago

avilella commented 3 years ago

Would it be possible to run the airr_rearrangement.tsv file based clustering and 3D network plotting without supplying a filtered h5 file?

If one only has data for the VDJ library and not the 5'GEX, this filtered h5 file wouldn't exist, yet the airr_rearrangement.tsv file for the VDJ data is available. See below the steps that currently work for both:

#!/usr/bin/env python
airr_file='/data/RUN/RUN021/X204SC20080363-Z01-F00N/L005/IM20p21-PET003-VDJ/IM20p21-PET003-VDJ_mlti/outs/vdj_b/airr_rearrangement.tsv'
filt_file='/data/RUN/RUN021/X204SC20080363-Z01-F00N/L005/IM20p21-PET003-VDJ/IM20p21-PET003-VDJ_mlti/outs/count/filtered_feature_bc_matrix.h5'

outputdir='/data/RUN/RUN021/X204SC20080363-Z01-F00N/L005/IM20p21-PET003-VDJ/IM20p21-PET003-VDJ_mlti/outs/vdj_b//ddli_images'

import os
import pandas as pd
os.chdir(os.path.expanduser('/home/avilella/dandelion/'))
import dandelion as ddl
import scanpy as sc

print("read_10x_airr")
vdj = ddl.read_10x_airr(airr_file)
print("end read_10x_airr")
print("read_10x_h5")
adata = sc.read_10x_h5(filt_file,gex_only=True)
print("end read_10x_h5")
adata.obs['sample_id'] = 'sample'
print("filter bcr")
vdj, adata = ddl.pp.filter_bcr(vdj, adata)
print("end filter bcr")

print("find_clones")
ddl.tl.find_clones(vdj)
print("end find_clones")

print("transfer")
ddl.tl.transfer(adata, vdj)
print("end transfer")

print("generate_network")
ddl.tl.generate_network(vdj, dim=3, key="sequence_alignment")
print("end generate_network")

import networkx as nx
import random
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

color_dict = {'unassigned':'#e7e7e7',
              'No_BCR':'#ffffff',
              'IgM':'#264653',
              'IgD':'#2A9D8F',
              'IgA':'#E9C369',
              'IgE':'#000000',
              'IgG':'#D7431D'
             }

# replace missing values
vdj.metadata['isotype'] = vdj.metadata['isotype'].replace(np.nan, 'No_BCR')
vdj.metadata['isotype'] = vdj.metadata['isotype'].replace('', 'unassigned')

## create a color dict mapping for the nodes
isotype = {x:color_dict[vdj.metadata.at[x, 'isotype']] for x in vdj.metadata.index}

# let's just do it in the expanded plot
layout = vdj.layout[1]
G = vdj.graph[1]

# extract layoyt positons
def extract_position(layout):
    pos = {l: (layout[l][0], layout[l][1], layout[l][2]) for l in layout}
    return(pos)

pos = extract_position(layout)

# add node attributes
for p in pos:
    G.nodes[p]['pos'] = pos[p]
for i in isotype:
    if i in G.nodes():
        G.nodes[i]['isotype'] = isotype[i]

# core function
def network_plot_3D(G, angle, col_dict, outputdir, save=False):
    # Get node positions
    pos = nx.get_node_attributes(G, 'pos')

    # Get the maximum number of edges adjacent to a single node
    edge_max = max([G.degree(i) for i in G.nodes()])
    # Define color range proportional to number of edges adjacent to a single node
    colors = col_dict 
    # 3D network plot
    with plt.style.context(('ggplot')):

        fig = plt.figure(figsize=(14,14))
        ax = Axes3D(fig)

        # Loop on the pos dictionary to extract the x,y,z coordinates of each node
        for key, value in pos.items():
            xi = value[0]
            yi = value[1]
            zi = value[2]

            # Scatter plot
            ax.scatter(xi, yi, zi, color=colors[key], s=50, edgecolors='k', alpha=1)
            ax.set_facecolor("white")

        # Loop on the list of edges to get the x,y,z, coordinates of the connected nodes
        # Those two points are the extrema of the line to be plotted
        for i,j in enumerate(G.edges()):
            x = np.array((pos[j[0]][0], pos[j[1]][0]))
            y = np.array((pos[j[0]][1], pos[j[1]][1]))
            z = np.array((pos[j[0]][2], pos[j[1]][2]))

        # Plot the connecting lines
            ax.plot(x, y, z, c='black', alpha=0.5)
        ax.set_xlim(-1, 1)
        ax.set_ylim(-1, 1)
        ax.set_zlim(-1, 1)
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
        bbox = fig.bbox_inches.from_bounds(1, 1, 12, 12)

    # Set the initial view
    ax.view_init(30, angle)
    # Hide the axes
    ax.set_axis_off()
    if save == True:
        plt.savefig(outputdir+str(angle).zfill(3)+".png", bbox_inches=bbox, dpi=720)
    else:
        plt.show()
    return

from tqdm import tqdm
from joblib import Parallel, delayed

# plot and save out!
# for k in tqdm(range(0,360,1)):
#     angle = k
#     network_plot_3D(G, angle, col_dict = isotype, save=True)

Parallel(n_jobs=8)(delayed(network_plot_3D)(G, i, col_dict = isotype, outputdir=outputdir, save = True) for i in tqdm(range(0,360,1)))
#Parallel(n_jobs=8)(delayed(network_plot_3D)(G, i, save = True) for i in tqdm(range(0,360,1)))

#sc.set_figure_params(figsize = [4,4])
#ddl.pl.clone_network(adata, 
#                     color = ['sample_id'], 
#                     edges_width = 1,
#                     size = 50) 
#
import pdb; pdb.set_trace()
pass; # args.debug=False

1;
zktuong commented 3 years ago

Hi @avilella,

If I understood correctly, all you need to do is to create a dummy anndata object where the indices of .obs are matching to cell_ids in your airr file. If Dandelion doesn't initialise because there's no cell_ids or any other of the required columns, then you can just make it up e.g. the cell_ids can just be a unique barcode for each contig (like the sequence_id) if you dealing with bulk-level data.

import scanpy as sc
import scipy.sparse

# assuming your filtered dandelion object is called vdj
obs = pd.DataFrame(index = vdj.metadata.index)
n = vdj.metadata.shape[0]

 # just create a random matrix
adata = sc.AnnData(X = scipy.sparse.random(n, 100), obs = obs)

# this is just to populate the neighbors slot
sc.pp.neighbors(adata)

# then transfer
ddl.tl.transfer(adata, vdj)