DaidalosCheung / Python

0 stars 0 forks source link

Chem_Comparing #6

Open DaidalosCheung opened 5 years ago

DaidalosCheung commented 5 years ago

First successful version

(No file output yet, and the printed scores are not traced back to the reference molecules that produced them.)

from rdkit import Chem
import os
from rdkit.Chem import MACCSkeys

from sklearn.metrics import jaccard_similarity_score

# Reference set: loaded into ref_molecules in the main section below
sdf_file = "all.sdf"
# Target set: loaded into tar_molecules in the main section below
com_file = "test.sdf"

# Read chemical molecules from the file
def get_molecules( chem_file ):
    """Collect the molecules stored in an SD file.

    Args:
        chem_file: path handed to Chem.SDMolSupplier.

    Returns:
        List of the entries the supplier yielded, with None entries
        left out. The length of that list is printed before returning.
    """
    # The supplier yields None for some entries; those are dropped here
    molecules = [entry for entry in Chem.SDMolSupplier( chem_file ) if entry is not None]

    print(len(molecules))

    return molecules

# Apply the Jaccard method to evaluate the similarity between two lists of fingerprints and return a nested list of scores
def compare_jaccard( tar_fps, ref_fps ):
    """Score every target fingerprint against every reference fingerprint.

    Args:
        tar_fps: sequence of target fingerprints.
        ref_fps: sequence of reference fingerprints.

    Returns:
        Nested list with one row per target fingerprint; entry [i][j] is
        jaccard_similarity_score(tar_fps[i], ref_fps[j]). An empty
        tar_fps gives [], and an empty ref_fps gives one empty row per
        target.
    """
    # NOTE(review): scikit-learn deprecated jaccard_similarity_score in favour
    # of jaccard_score; confirm the installed version still provides it and
    # that its result on bit lists is the metric intended here.
    Jaccard_score = []
    # Walk the tar_fps argument itself rather than a module-level list, so the
    # number of rows always matches the input that was actually passed in.
    for tar_fp in tar_fps:
        row = []
        for ref_fp in ref_fps:
            row.append( jaccard_similarity_score( tar_fp, ref_fp ) )
        Jaccard_score.append(row)

    return Jaccard_score

# Calculate the similarity between two lists of fingerprints and return the scores larger than the threshold
def similarity_calculation( tar_fps, ref_fps, threshold):
    """Keep, per target fingerprint, the scores that exceed the threshold.

    Args:
        tar_fps: list of target fingerprints.
        ref_fps: list of reference fingerprints.
        threshold: a score is kept only when it is strictly greater than this.

    Returns:
        Nested list with one inner list per target fingerprint, holding the
        kept scores in reference order.
    """
    score_matrix = compare_jaccard( tar_fps, ref_fps )
    ref_count = len(ref_fps)

    return [
        [
            score_matrix[t][r]
            for r in range(ref_count)
            if score_matrix[t][r] > threshold
        ]
        for t in range(len(tar_fps))
    ]

################################### MAIN FUNCTION ###################################

# Calculate the fingerprints of the target and reference files, producing two fingerprint lists
# The threshold is asked for first, then the target file is read before the reference file
threshold = float( input("Enter the similarity threshold: "))
tar_molecules = get_molecules( com_file )
ref_molecules = get_molecules( sdf_file )

# One MACCS key list per molecule
tar_fps = [list(MACCSkeys.GenMACCSKeys(mol)) for mol in tar_molecules]
ref_fps = [list(MACCSkeys.GenMACCSKeys(mol)) for mol in ref_molecules]

result = similarity_calculation( tar_fps, ref_fps, threshold)

# Print each target's kept scores, one per line, followed by a separator banner
for kept_scores in result:
    for score in kept_scores:
        print(score)
    print("\n######################################################################\n")
DaidalosCheung commented 5 years ago

Final version with file output

from rdkit import Chem
import os
from rdkit.Chem import MACCSkeys
from sklearn.metrics import jaccard_similarity_score

# Reference set: similarity_calculation loads this file as ref_molecules
sdf_file = "all.sdf"
# Target set: similarity_calculation loads this file as tar_molecules
com_file = "test.sdf"

# Read chemical molecules from the file
def get_molecules( chem_file ):
    """Collect the molecules stored in an SD file.

    Args:
        chem_file: path handed to Chem.SDMolSupplier.

    Returns:
        List of the entries the supplier yielded, with None entries
        left out. A "Total <count>" line is printed before returning.
    """
    supplier = Chem.SDMolSupplier( chem_file )
    # The supplier yields None for some entries; those are dropped here
    molecules = [entry for entry in supplier if entry is not None]

    kept_count = len( molecules )
    print( "Total " + str(kept_count) + "\n" )

    return molecules

# Apply the Jaccard method to evaluate the similarity between two lists of fingerprints and return a nested list of scores
def compare_jaccard( tar_fps, ref_fps ):
    """Build the full target-by-reference score table.

    Args:
        tar_fps: list of target fingerprints.
        ref_fps: list of reference fingerprints.

    Returns:
        Nested list with one row per target fingerprint; entry [i][j] is
        jaccard_similarity_score(tar_fps[i], ref_fps[j]).
    """
    return [
        [jaccard_similarity_score( tar_fp, ref_fp ) for ref_fp in ref_fps]
        for tar_fp in tar_fps
    ]

# Calculate the similarity between the molecules of two files, write the scores larger than the threshold to a results file, and return them
def similarity_calculation( com_file, sdf_file, threshold ):
    """Compare the molecules of two SD files and report the scores above a threshold.

    Both files are read with get_molecules, each molecule is turned into a
    list of MACCS keys, and every target is scored against every reference
    with compare_jaccard. Scores strictly greater than the threshold are
    written to "Similarity_results.txt" together with the DATABASE_ID
    property of the matching reference molecule.

    Args:
        com_file: path of the target SD file.
        sdf_file: path of the reference SD file.
        threshold: a score is kept only when it is strictly greater than this.

    Returns:
        Nested list with one inner list per target molecule, holding the
        kept scores in reference order.
    """
    tar_molecules = get_molecules( com_file )
    ref_molecules = get_molecules( sdf_file )

    tar_fps = [list(MACCSkeys.GenMACCSKeys(mol)) for mol in tar_molecules]
    ref_fps = [list(MACCSkeys.GenMACCSKeys(mol)) for mol in ref_molecules]
    Jaccard_score = compare_jaccard( tar_fps, ref_fps )

    similarity_list = []
    # The with-block closes the results file even when a write or a
    # DATABASE_ID lookup raises part-way through the report.
    with open("Similarity_results.txt","w+") as result_file:
        for tar_mol, score_row in zip( tar_molecules, Jaccard_score ):
            result_file.write( "\nThe similarity for " + str( tar_mol.GetProp('DATABASE_ID')) + "\nwith %f above" % (threshold) + "\n" )
            kept_scores = []
            for ref_mol, score in zip( ref_molecules, score_row ):
                if score > threshold:
                    result_file.write( str(score) + "\t" + str( ref_mol.GetProp('DATABASE_ID') ) + "\n"  )
                    kept_scores.append( score )
            similarity_list.append( kept_scores )

    return similarity_list

################################### MAIN FUNCTION ###################################

# Read the threshold, run the comparison on the target and reference files, and print the returned scores

threshold = float( input("Enter the similarity threshold: "))
result = similarity_calculation( com_file, sdf_file, threshold )

# Print each target's kept scores, one per line, followed by a separator banner
for kept_scores in result:
    for score in kept_scores:
        print(score)
    print("\n######################################################################\n")
DaidalosCheung commented 4 years ago

You can either use loops:

# Build the list of squares of 0..9 with an explicit loop
squares = []

for x in range(10):
    squares.append(x**2)

# print() is called as a function: the bare print statement is a syntax error on Python 3
print(squares)
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

Or you can use list comprehensions to get the same result:

# Build the list of squares of 0..9 in a single list comprehension
squares = [x**2 for x in range(10)]

# print() is called as a function: the bare print statement is a syntax error on Python 3
print(squares)
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

https://www.pythonforbeginners.com/basics/list-comprehensions-in-python