Closed ZQY19960810 closed 2 years ago
Hi,
similarities[(similarities["SN"]>= sn_thr)].sort_values("z_ssim")
?Regarding the filtering thresholds, I advise against blindly using the 0.5 and -1 values. I recommend checking our update FAQ where we have some info about figuring out what thresholds could make sense for your data.
Thank for your answer, then I will retry with FAN-C .hic file
sorry,I retry use FAN-C file ,but It did't worked here are my some information,i think it will help you solve my problem chess pairs producted susScr11_chr1_5mb_win_200kbstep.bed FAN-C file: SAMN09691005.1.50k.KR.hic SAMN15870118.1.50k.KR.hic The resulting process generates : fanc map SAMN09691005.fastq.gz ~/Sus_scrofa.Sscrofa11.1.dna.chrlevel_index/Susscrofa.Sscrofa11.1 SAMN09691005.sam -t 16 --fanc-parallel --no-iterative -s 5 -q 30 -r HindIII --split-fastq samtools sort -n SAMN09691005*.sam fanc pairs SAMN09691005*.sorted.sam SAMN09691005.pairs fanc hic SAMN09691005.pairs -b 50k --chromosomes 1 --norm-method KR SAMN09691005.1.50k.KR.hic (SAMN15870118 same) then I runned chess.sin and chess.extract , they finished with no error I start use python to plot contact matrix here is my code
`import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import fanc
import fanc.plotting
from scipy import ndimage as ndi
import matplotlib.patches as patches
from scipy.ndimage import zoom
from mpl_toolkits.axes_grid1 import make_axes_locatable
#plt.rcParams['text.usetex'] = True
# plt.style.use(['dark_background'])
# plt.style.use('classic')
#%matplotlib inline
prop_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
SMALL_SIZE = 13
MEDIUM_SIZE = 15
BIGGER_SIZE = 20
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
winsize = "5mb"
wdir = "./"
chess_results_file = "SAMN15870118_vs_SAMN09691005_chr1_{}_chess_resluts.tsv".format(winsize)
region_pairs = "susScr11_chr1_{}_win_200kb_step.bed".format(winsize)
similarities = pd.read_csv(wdir + chess_results_file, sep='\t', index_col=0)
regions = pd.read_csv(wdir + region_pairs, sep='\t', header=None)
sim_field = "z_ssim"
sn_thr = 0.41
zsim_thr = -0.9
sub_sim = similarities[(similarities["SN"]>= sn_thr) & (similarities[sim_field]<= zsim_thr)]
all_X = regions.loc[similarities.index, 1:2].mean(axis=1).values / 10 ** 6
X = regions.loc[sub_sim.index, 1:2].mean(axis=1).values / 10 ** 6
S = sub_sim[sim_field]
SN = sub_sim["SN"]
plt.figure(figsize=(10, 3))
plt.plot(all_X, similarities[sim_field], ":", alpha=0.4)
plt.hlines(zsim_thr, 0, max(all_X), linestyle=":", color="red")
plt.scatter(all_X, similarities[sim_field], facecolors='none', edgecolors='grey', alpha=0.1, s=20)
plt.scatter(X, S, c=SN, marker='.')
plt.ylabel(sim_field.replace("_", "-"))
plt.xlabel("window midpoint [Mb]")
c = plt.colorbar()
c.set_label("signal/noise")
plt.tight_layout()
plt.savefig("SAMN15870118_vs_SAMN09691005_{}_chr1_chess_KR_results_track.png".format(winsize), dpi=250)
similarities[(similarities["SN"]>= sn_thr)].sort_values("z_ssim")
hic_patient = fanc.load("SAMN09691005.1.50k.KR.hic")
hic_control = fanc.load("SAMN15870118.1.50k.KR.hic")
region_id = 144
window_start, window_end = regions.loc[region_id][1:3]
region_string = "1:{}-{}".format(window_start, window_end)
patient_hic = hic_patient.matrix((region_string,region_string),oe=True)
control_hic = hic_control.matrix((region_string,region_string),oe=True)
print(region_string)
patient_region_sub = patient_hic[region_string, region_string].data
control_region_sub = control_hic[region_string, region_string].data
min_v = min(
[
np.min(np.extract(patient_region_sub>0 , patient_region_sub)),
np.min(np.extract(control_region_sub>0 , control_region_sub))
]
)
patient_region_sub += min_v
control_region_sub += min_v
l2fcm = np.log2(patient_region_sub / control_region_sub)
fig, axes = plt.subplots(1, 3, figsize=(9, 3))
axes[0].set_title('SAMN09691005')
axes[1].set_title('SAMN15870118')
axes[2].set_title('$log_2$(SAMN09691005/SAMN15870118)')
m1 = axes[0].imshow(patient_region_sub, norm=matplotlib.colors.LogNorm(), cmap='germany')
m2 = axes[1].imshow(control_region_sub, norm=matplotlib.colors.LogNorm(), cmap='germany')
m3 = axes[2].imshow(l2fcm, cmap='seismic', vmax=5, vmin=-5)
for m, ax in zip([m1, m2, m3], axes):
ax.axis('off')
divider = make_axes_locatable(ax)
cax = divider.append_axes('bottom', size='5%', pad=0.05)
fig.colorbar(m, cax=cax, orientation='horizontal')
plt.tight_layout()
plt.savefig("SAMN15870118_vs_SAMN09691005_5mb_chr1_chess_KR_region.png", dpi=250)
## obtaining regions of interest
regions2compare = regions.loc[sub_sim.index]
regions2compare.to_csv('filtered_SAMN15870118_vs_SAMN09691005_chr1_{}_chess_resluts.tsv'.format(winsize), '\t', index=False, header=False)
def clipped_zoom(img, zoom_factor, **kwargs):
h, w = img.shape[:2]
zoom_tuple = (zoom_factor,) * 2 + (1,) * (img.ndim - 2)
if zoom_factor < 1:
zh = int(np.round(h * zoom_factor))
zw = int(np.round(w * zoom_factor))
top = (h - zh) // 2
left = (w - zw) // 2
out = np.zeros_like(img)
out[top:top+zh, left:left+zw] = zoom(img, zoom_tuple, **kwargs)
elif zoom_factor > 1:
zh = int(np.round(h / zoom_factor))
zw = int(np.round(w / zoom_factor))
top = (h - zh) // 2
left = (w - zw) // 2
out = zoom(img[top:top+zh, left:left+zw], zoom_tuple, **kwargs)
trim_top = ((out.shape[0] - h) // 2)
trim_left = ((out.shape[1] - w) // 2)
out = out[trim_top:trim_top+h, trim_left:trim_left+w]
else:
out = img
return out
def highlight_features(dataframe, region, color, a, axes):
try:
features = dataframe.loc[region].values.tolist()
if type(features[0]) == int:
_, x_min, x_max, y_min, y_max = features
rect = patches.Rectangle((x_min,y_min),x_max-x_min,y_max-y_min,linewidth=1.2,
edgecolor=color, facecolor='none')
axes[a].add_patch(rect)
else:
for f in features:
_, x_min, x_max, y_min, y_max = f
rect = patches.Rectangle((x_min,y_min),x_max-x_min,y_max-y_min,linewidth=1.2,
edgecolor=color, facecolor='none')
axes[a].add_patch(rect)
except KeyError:
next
## load gained and lost features
gained = pd.read_csv(wdir + 'gained_features.tsv', delimiter=',', usecols=[0, 1, 2, 3,4, 5], header=None, index_col=[0])
lost = pd.read_csv(wdir + 'lost_features.tsv', delimiter=',', usecols=[0, 1, 2, 3, 4, 5], header=None, index_col=[0])
reg = 144
window_start, window_end = regions.loc[reg][1:3]
region_string = "1:{}-{}".format(window_start, window_end)
patient_region_sub = patient_hic[region_string, region_string].data
control_region_sub = control_hic[region_string, region_string].data
min_v = min(
[
np.min(np.extract(patient_region_sub>0 , patient_region_sub)),
np.min(np.extract(control_region_sub>0 , control_region_sub))
]
)
patient_region_sub += min_v
control_region_sub += min_v
l2fcm = np.log2(patient_region_sub / control_region_sub)
zml2 = clipped_zoom(l2fcm, 0.7)
rot_l2 = ndi.rotate(zml2, 45, reshape=False)
fig, axes = plt.subplots(1, 3, figsize=(16, 8))
axes[0].set_title('SAMN09691005')
axes[1].set_title('SAMN15870118')
axes[2].set_title('$log_2$(SAMN09691005 / SAMN15870118)')
# clipped zoom and rotate patient and control and keep only half-matrix
zm1 = clipped_zoom(control_region_sub, 0.7)
rot_control = ndi.rotate(zm1, 45, reshape=False)
zm2 = clipped_zoom(patient_region_sub, 0.7)
rot_patient = ndi.rotate(zm2, 45, reshape=False)
middle = int(np.shape(rot_control)[1]/ 2.)
m1 = axes[0].imshow(rot_patient[:middle, :], vmin=0, vmax=0.03, cmap='germany')
m2 = axes[1].imshow(rot_control[:middle,:], vmin=0, vmax=0.03, cmap='germany')
# per region check if identified features, to highlight
highlight_features(gained, reg, 'crimson', 0, axes)
highlight_features(lost, reg, 'royalblue', 1, axes)
m3 = axes[2].imshow(rot_l2[:middle,:], cmap='seismic', vmax=5, vmin=-5)
for m, ax in zip([m1, m2, m3], axes):
ax.axis('off')
divider = make_axes_locatable(ax)
cax = divider.append_axes('bottom', size='5%', pad=0.05)
fig.colorbar(m, cax=cax, orientation='horizontal')
plt.tight_layout()
plt.savefig("SAMN15870118_vs_SAMN09691005_5mb_chr1_KR_region_with_features.png", dpi=250)`
but my plot didn't finised , like this
Could you find out what I did wrong? This problem has been bothering me for some time,Thank you
What are the values in the matrices / regions you are plotting? There is a vmax=0.03, so everything exceeding that will be black.
Please tell me how to find the value as you said , I have seen the parameter vmax in "fancplot", but I don't know how to check it
Here in these lines vmax is set to 0.03.
m1 = axes[0].imshow(rot_patient[:middle, :], vmin=0, vmax=0.03, cmap='germany')
m2 = axes[1].imshow(rot_control[:middle,:], vmin=0, vmax=0.03, cmap='germany')
So if all values in rot_patient[:middle, :]
are > 0.03, the plot will be black.
You could look at the minimum value in rot_patient[:middle, :]
, or better at the distribution of the matrix entries.
unfortunatly, I use fancplot trying to find the threshold of vmax , but th value of vmax is very large compared to what your show in example , btw,my input file is produced from juicer pre command. here is my command and results
fancplot -o plot.png 1:1mb-8mb -p triangular -r inter_30.hic@50kb
so I noticed that you seemly didn't use KR in fancplot, and I try to use inter_30.hic@50kb@KR ,but didn't work
If I remember correctly, Juicer's version of Hi-C normalisation multiples the output values to get a final sum equal to the input number of valid read pairs, giving a value that can be interpreted as "normalised contacts". FAN-C's version of Hi-C normalisation does not, giving a value that can be interpreted as "normalised contact probability". Both are valid ways of representing the data, but of course you'll need to choose a colour scale threshold that's appropriate for your data (even for normalised contact probability 0.03 is not always the best threshold). Based on the plot above I'd suggest you see how it looks with vmax=500
and adjust from there.
Thank you! It solved my question.Thanks again
Best wishes
dear chesser: When I want to display this information from the results from chess sim and filtered it. So I copied your code from your jupyter notebook at examples/dlbcl/example_analysis.ipynb, now I have a few questions about that first , when I display only regions with a signal-to-noise ration > 0.5 and a z-normalised similarity score < -1 as colored dots, I noticed the units of your matrix 's x-asix from left to right is Negative power of 10 , but my units of x-asix shows Positive power of 10 , then I noticed that you used reference_contacts and query_contacts both used balanced matrix , so tried KR normalized method but didn't work, can you tell me how to fix it? BTW, my file was generateed from juicer. Second , I want to ask how input juicer .hic files to fanc.load(), my plot is whole black , but it can shows lost&gained information , should I input fanc .hic files? Finally, I didn't find your code about filtering with the signal-to-noise > 0.5 and z-ssim < -1 cutoffs(see the jupyter notebook for subsetting code),can you paste a link for me? Thanks