Open Mayurk619 opened 2 months ago
I made some changes and would like to suggest the following update to the parser code:
#!/usr/bin/env python
import pandas as pd
import argparse
"""
Parses annotation results from KEGG and optionally will pull in results from interproscan.
Assumes interproscan was run using the following flags: -f tsv --goterms --iprlookup --pathways.
"""
parser = argparse.ArgumentParser(description='Combines annotation Data for input to anvio')
parser.add_argument('--KeggDB', help='Identify the Kegg Orthology file (modified from htext using given bash script)')
parser.add_argument('-i', help='Specify the file containing GhostKoala Results')
parser.add_argument('--interproscan', help='Interproscan results')
parser.add_argument('-o', help='Specify an output file')
args = parser.parse_args()
arg_dict = vars(args)
keggortho_database = arg_dict['KeggDB']
output = arg_dict['o']
GK_results = arg_dict['i']
# Read in KO_Orthology file and format for downstream analysis
x = pd.read_table(keggortho_database, header=None, sep='\t')
# Split the last column into accession and description manually
accessions = []
descriptions = []
for item in x[3]:
parts = item.split(' ', 1)
accessions.append(parts[0])
descriptions.append(parts[1] if len(parts) > 1 else '')
x['accession'] = accessions
x['description'] = descriptions
# Drop the original column and set index
xy = x.drop(3, axis=1).set_index('accession')
xy.columns = ["Category1", "Category2", "Category3", "description"]
xy.to_csv("KeggOrthology_Table1.txt", encoding='utf-8')
# Process GhostKoala results
keggAnnotation = pd.read_table(GK_results, header=None, names=["gene_callers_id", "accession"], index_col=None)
keggAnnotation = keggAnnotation.replace({'genecall_': ''}, regex=True)
keggAnnotation = keggAnnotation.dropna().set_index("accession")
merged = keggAnnotation.join(xy)
merged_reduced = merged.drop_duplicates(subset='gene_callers_id', keep="last")
# Extract relevant information and format for output
extracted = merged_reduced.filter(['gene_callers_id', 'description', 'accession']).reset_index().set_index('gene_callers_id')
e_value = [0] * len(extracted['accession'].tolist())
source = ['KeggGhostKoala'] * len(extracted['accession'].tolist())
extracted.insert(0, 'source', source)
extracted.insert(3, 'e_value', e_value)
extracted = extracted.rename(columns={'description': 'function'}, index=str)
print(extracted.head())
if arg_dict["interproscan"] is not None:
interpro = pd.read_table(arg_dict["interproscan"], header=None, names=[
"gene_callers_id", "MD5", "Length", "source", "accession", "function",
"start_loc", "stop_loc", "e_value", "status", "date", "InterProAccession",
"InterProDescription", "GOAnnotations", "Pathway"])
InterProExtracted = interpro.filter(["gene_callers_id", "source", "accession", "function", "e_value"])
InterProExtracted = InterProExtracted.replace({'genecall_': ''}, regex=True)
InterProExtracted['e_value'] = InterProExtracted['e_value'].replace('-', 0)
InterProExtracted = InterProExtracted.set_index("gene_callers_id")
KEGG_InterPro_Combined = pd.concat([extracted, InterProExtracted])
KEGG_InterPro_Combined.to_csv(output, sep='\t')
else:
extracted.to_csv(output, sep='\t')
Hi,
When I run this command I get a TypeError caused by this line: y =pd.DataFrame(x[3].str.split(' ',1).tolist(),columns=['accession','description']). You can check the contents of the files below. For some reason, the table looks slightly different from the one shown here.
$ python3 KEGG-to-anvio --KeggDB ../KO_Orthology_ko00001.txt -i ../user_ko.txt -o KeggAnnotations-AnviImportable.txt
$ cat ../KO_Orthology_ko00001.txt | head
$ cat ../user_ko.txt | head
Thanks!