Closed ammaraziz closed 1 year ago
package requirement: pandas, biopython, os input ie. subtype = 'H3N2' passage_h = 'Egg' GENE = 'HA' filter_fasta = './sample.fasta' # output file name filter_csv = './output.csv' # output metafiles input_fasta = "./dump.fasta" # input file name input_csv = './example.csv' # input csv from fuzee
def rename(input_fasta, input_csv, filter_fasta, passage_h, subtype, GENE):
dump_seq = list(SeqIO.parse(input_fasta, "fasta"))
data = pd.read_csv(input_csv)
#param setting
gene = dict({ '4': 'HA',
'6': 'NA',
'1': 'PB2',
'2': 'PB1',
'3': 'PA',
'5': 'NP',
'7': 'MP' })
passage = dict({ 'q':'QMC',
'e': 'Egg',
'o': 'original_specimen',
'cs': 'clinical_specimen'})
#pandas operation
#seperate data to new column for extracting the gene id
data[['Seq','Gene_id']] = data['Seq No'].str.split('.',expand=True)
#map gene id with name and write it to a new column
data['Gene'] = data['Gene_id'].map(gene)
#extract month and convert into the String (ie. jan, feb, etc.)
data['Month'] = pd.to_datetime(data['Sample Date']).dt.strftime('%b').str.lower()
#concatenate a designation and month to create the name using in a phylogenetic tree
data['name'] = data['Designation'].str.replace(" ", "_") + '_' + data['Month']
#find the pattern and map the passage history
data['Passage type'] = data['Passage History'].astype(str).str[0].str.lower().map(passage)
#filter only the row that contains information about Designation, HA clade, Sub Type, and Passage History
processed_data = data.loc[(data['Designation'].isnull() + data['HA Clade'].isnull() + data['Sub Type'].isnull()+ data['Passage History'].isnull()) ==0 ]
#filter only the row that match with our query
processed_data = processed_data.loc[processed_data['Sub Type'] == subtype]
processed_data = processed_data.loc[processed_data['Passage type'] == passage_h]
processed_data = processed_data.loc[processed_data['Gene'] == GENE]
# use the info from data to subset only sequences you want
subset_fasta = []
for seq in dump_seq:
if seq.id in list(processed_data['Seq No']):
subset_fasta.append(seq)
#iterate through them to rename and write into new fasta file
with open(filter_fasta, 'w') as f_fasta:
for seq in subset_fasta:
name = list(processed_data.loc[processed_data['Seq No'] == seq.id, 'name'])[0]
seq.id = name
seq.description = name
SeqIO.write(seq, f_fasta, 'fasta')
#write clean metafile
colname = ['Seq No','Lab #','Lineage', 'Designation',' Passage History' , 'Sample Date',' Results', 'GISAID #','HA Clade', 'name']
processed_data[colname].to_csv(filter_csv,index=False)
Thanks @aradahir that's great work. I'll work on integrating it.
This has been combined with #6
Current method for renaming fasta files is very repetitive and cumbersome. The process is:
This should be automated. New subcommand:
rename
.Example:
Input:
meta.tsv
contents:seqno, passage, gene, date, designation
.Output:
{designation}{passage_short}_{month_abbr}
.Potential additions:
--include-month
,--include-passage