Closed erinyoung closed 1 year ago
@erinyoung Here is a Python script for plotting a pie chart for each sample. I used the '*.aggregate.tsv' output file, which includes all the samples, rather than the individual 'demix.tsv' files. I built on some of the existing Freyja code (I reused their 'prepLineageDict' function), so it might be easier to incorporate into the existing workflow. Let me know if this works for you.
# Import required packages
import pandas as pd
import re
import os
import glob
import matplotlib.pyplot as plt
# Function to prepare lineage dictionary
def prepLineageDict(agg_d0):
    """Parse the lineage/abundance strings and add a per-sample mapping.

    The 'lineages' and 'abundances' columns are expected to hold
    whitespace-separated strings. Both columns are replaced by lists of
    tokens, and a new 'linDict' column is added whose value for each row is
    a ``{lineage: float(abundance)}`` dictionary built by pairing the two
    token lists positionally.

    Args:
        agg_d0: DataFrame with string columns 'lineages' and 'abundances'.
            It is modified in place.

    Returns:
        The same DataFrame object, with the updated/added columns.

    Raises:
        ValueError: If an abundance token cannot be converted to float.
    """
    # str.split() with no separator collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no empty tokens reach float().
    lineages = agg_d0['lineages'].apply(str.split)
    abundances = agg_d0['abundances'].apply(str.split)

    # Replace the columns outright instead of writing lists into the
    # existing string columns element by element.
    agg_d0['lineages'] = lineages
    agg_d0['abundances'] = abundances

    # Pair rows positionally rather than by index label, so duplicated
    # sample names in the index still yield one dictionary per row.
    agg_d0['linDict'] = [
        {lin: float(abund) for lin, abund in zip(lins, abunds)}
        for lins, abunds in zip(lineages, abundances)
    ]
    return agg_d0
# Function to make pie charts for each sample
def makePieCharts_simple(agg_df, lineages, outputFnBase):
    """Save one pie chart PNG per sample showing its lineage abundances.

    Args:
        agg_df: DataFrame indexed by sample name; it is passed to
            ``prepLineageDict``, which adds the 'linDict' column used here.
        lineages: Flag; when falsy the function returns without plotting.
        outputFnBase: Prefix for the output file names. Each chart is
            written to ``<outputFnBase><sample name>.png``.

    Returns:
        None.
    """
    if not lineages:
        return

    # Pass the function's own argument; the original referenced an
    # undefined name here and raised NameError on every call.
    agg_df = prepLineageDict(agg_df)

    for sample_name, dat in agg_df['linDict'].items():
        # A list-valued entry is unwrapped to its first element.
        if isinstance(dat, list):
            dat = dat[0]
        loc = pd.Series(dat)

        fig, ax = plt.subplots()
        ax.pie(loc, labels=loc.index, autopct='%1.1f%%', shadow=True,
               startangle=90)
        ax.axis('equal')  # Equal aspect ratio draws the pie as a circle.
        ax.set_title(sample_name)

        # The f-string also accepts non-string index labels (e.g. integers).
        fig.savefig(f'{outputFnBase}{sample_name}.png')
        # Close this figure explicitly so figures do not accumulate.
        plt.close(fig)
# Gather all '*lineages_aggregate.tsv' files in the current directory
if __name__ == '__main__':
    # Run only when executed as a script, so importing this module to reuse
    # the functions above does not read files or write charts.
    # sorted() gives a deterministic processing order; glob order is arbitrary.
    for tsv_path in sorted(glob.glob('*lineages_aggregate.tsv')):
        print(tsv_path)
        # Read the aggregate table; the first column becomes the index.
        df = pd.read_csv(tsv_path, sep="\t", index_col=0)
        # Create pie charts for each sample in the DataFrame
        makePieCharts_simple(df, 'lineages', 'freyja_sublin_pie_')
This looks great!
For each sample (presumably wastewater), create a pie chart of its results. This makes it easy to convey proportions.