Open DaidalosCheung opened 5 years ago
from rdkit import Chem
import os
from rdkit.Chem import MACCSkeys
from sklearn.metrics import jaccard_similarity_score
# Reference SD file: every target molecule is compared against its molecules.
sdf_file = "all.sdf"
# Target SD file: the molecules whose neighbours are being looked up.
com_file = "test.sdf"
# Read chemical molecules from the file
def get_molecules( chem_file ):
    """Load the molecules stored in an SD file.

    Entries that the RDKit supplier yields as ``None`` are skipped.  The
    number of molecules kept is printed before returning.

    Args:
        chem_file: Path of the SD file handed to ``Chem.SDMolSupplier``.

    Returns:
        A list with every non-``None`` molecule, in file order.
    """
    supplier = Chem.SDMolSupplier(chem_file)
    loaded = [entry for entry in supplier if entry is not None]
    print(f"Total {len(loaded)}\n")
    return loaded
# Apply the Jaccard method to evaluate the similarity between two lists of fingerprints and output a nested list
def compare_jaccard( tar_fps, ref_fps ):
    """Return the pairwise Jaccard similarity of two fingerprint lists.

    Each fingerprint is a sequence of bits (truthy = on, falsy = off).  The
    score of a pair is the number of positions set in both fingerprints
    divided by the number of positions set in at least one of them.

    The previous implementation called scikit-learn's
    ``jaccard_similarity_score``, which for 1-D binary input returns the
    fraction of equal positions (shared zeros included) rather than the
    Jaccard index, and which is no longer shipped by current scikit-learn
    releases.  The index is therefore computed here directly.

    Args:
        tar_fps: List of target fingerprints.
        ref_fps: List of reference fingerprints.

    Returns:
        A nested list ``scores`` where ``scores[i][j]`` is the similarity
        between ``tar_fps[i]`` and ``ref_fps[j]``.

    Raises:
        ValueError: If two compared fingerprints differ in length.
    """
    return [
        [_jaccard_index(tar_fp, ref_fp) for ref_fp in ref_fps]
        for tar_fp in tar_fps
    ]


def _jaccard_index(bits_a, bits_b):
    """Return the Jaccard index of two equal-length bit sequences."""
    if len(bits_a) != len(bits_b):
        raise ValueError(
            "Fingerprints must have the same length: %d != %d"
            % (len(bits_a), len(bits_b))
        )
    in_both = 0
    in_either = 0
    for bit_a, bit_b in zip(bits_a, bits_b):
        if bit_a and bit_b:
            in_both += 1
        if bit_a or bit_b:
            in_either += 1
    if in_either == 0:
        # Two fingerprints without any set bit are identical; report them as
        # fully similar instead of dividing by zero.
        return 1.0
    return in_both / in_either
# Calculate the similarity between two lists of chemical molecules and show the results larger than the threshold
def similarity_calculation( com_file, sdf_file, threshold ):
    """Score target molecules against reference molecules and log the hits.

    Molecules are read from both files with ``get_molecules``, converted to
    MACCS-key bit lists and compared pairwise with ``compare_jaccard``.  For
    every target a header is written to ``Similarity_results.txt`` (relative
    to the current working directory, overwritten on each call), followed by
    one ``score<TAB>id`` line per reference whose score is strictly greater
    than ``threshold``.

    Every molecule is expected to carry a ``DATABASE_ID`` property, since
    ``GetProp('DATABASE_ID')`` is called on targets and on matching
    references.

    Args:
        com_file: Path of the SD file holding the target molecules.
        sdf_file: Path of the SD file holding the reference molecules.
        threshold: Scores must exceed this value to be reported.

    Returns:
        A nested list with, for each target molecule, the scores that
        exceeded ``threshold`` in reference order.
    """
    tar_molecules = get_molecules( com_file )
    ref_molecules = get_molecules( sdf_file )
    tar_fps = [list(MACCSkeys.GenMACCSKeys(mol)) for mol in tar_molecules]
    ref_fps = [list(MACCSkeys.GenMACCSKeys(mol)) for mol in ref_molecules]
    score_matrix = compare_jaccard( tar_fps, ref_fps )

    similarity_list = []
    # The context manager closes the file even if a property lookup or a
    # write fails part-way through.
    with open("Similarity_results.txt", "w+") as result_file:
        for tar_mol, score_row in zip(tar_molecules, score_matrix):
            result_file.write(
                "\nThe similarity for " + str(tar_mol.GetProp('DATABASE_ID'))
                + "\nwith %f above" % (threshold) + "\n"
            )
            hits = []
            for ref_mol, score in zip(ref_molecules, score_row):
                if score > threshold:
                    result_file.write(
                        str(score) + "\t" + str(ref_mol.GetProp('DATABASE_ID')) + "\n"
                    )
                    # NOTE(review): the original indentation was lost; the
                    # score is assumed to be collected only when it passes
                    # the threshold, like the line written above - confirm.
                    hits.append(score)
            similarity_list.append(hits)
    return similarity_list
################################### MAIN FUNCTION ###################################
# Calculate the fingerprints from the reference & target files, output two fingerprint lists
# Ask for the cut-off, run the comparison and echo every reported score.
threshold = float(input("Enter the similarity threshold: "))
result = similarity_calculation(com_file, sdf_file, threshold)
for target_scores in result:
    for score in target_scores:
        print(score)
    # NOTE(review): the original indentation was lost; the separator is
    # assumed to be printed once per target molecule - confirm.
    print("\n######################################################################\n")
# List-comprehension example: the same list built in two ways.

# Explicit loop: append one square at a time.
squares = []
for x in range(10):
    squares.append(x**2)
print(squares)
# Output: [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

# Equivalent list comprehension.
squares = [x**2 for x in range(10)]
print(squares)
# Output: [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
https://www.pythonforbeginners.com/basics/list-comprehensions-in-python
First successful version
(No file output, no trace back to the reference function)