tsudalab / ChemTS

Molecule Design using Monte Carlo Tree Search with Neural Rollout
153 stars 52 forks source link

How to Generate Molecules? #5

Open ALPHAYA-Japan opened 6 years ago

ALPHAYA-Japan commented 6 years ago

Hi, after saving the model in train_RNN.py, I loaded the model using model_from_json("model.json") and model.load_weights("model.h5")

I called generate_smile(model, 'CCCC') and got the following error:

ValueError: substring not found

What's wrong with my generate_smile()?

yangxiufengsia commented 6 years ago

Hi, what is the generate_smile(model, 'CCCC') function? Can you paste your code here so that I can check what is wrong?

ALPHAYA-Japan commented 6 years ago
def generate_smile(model,val):
    """Experimental SMILES sampler; as written it prints intermediate values and returns nothing.

    Args:
        model: object exposing ``predict``; it is called with the current
            list of symbol indices.
        val: sequence of symbols that must contain both "C" and "\\n".

    Returns:
        None -- every ``return`` statement in the body is commented out.

    Raises:
        ValueError: from ``val.index`` when "C" or "\\n" is not in ``val``
            (for example ``val='CCCC'`` has no newline, which yields
            "substring not found").
    """
    new_smile = []  # never filled: the append further down is commented out
    # Seed the sequence with the index of the symbol "C".
    start_smile_index = [val.index("C")]
    print(start_smile_index)
    # NOTE(review): start_smile_index is never extended inside the loop, so
    # once the loop is entered this condition cannot become false.
    while not start_smile_index[-1] == val.index("\n"):
        predictions = model.predict(start_smile_index)
        ##next atom probability
        smf = []
        # NOTE(review): X is not defined in this function -- presumably a
        # module-level training array; confirm against train_RNN.py.
        for i in range (len(X)):
            sm = []
            for j in range(len(X[i])):
                #if np.argmax(predictions[i][j])=!0
                # Keep the most probable symbol index at position (i, j).
                sm.append(np.argmax(predictions[i][j]))
            smf.append(sm)
        print(sm)
        print(smf)
        #print(len(sm))
        # new_smile.append(sampled_word)
    # return ''.join(new_smile)
    #sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    #return new_sentence
ALPHAYA-Japan commented 6 years ago

I found the above-mentioned function in your code (train_RNN.py). I was able to train the model and save it, but now I would like to reload it and start generating new SMILES. generate_smile(model,val) doesn't work. Could you please tell me how you used a pre-trained model to generate new molecules?

yangxiufengsia commented 6 years ago

Hi, the function generate_smile(model,val) in train_RNN.py is a test version that I used for testing my first implementation, so please don't use it. Can you try the following code to generate your molecules?

from subprocess import Popen, PIPE from math import * import random import numpy as np import random as pr from copy import deepcopy from types import IntType, ListType, TupleType, StringTypes import itertools import time import math import tensorflow as tf import argparse from load_model import loaded_model from keras.preprocessing import sequence from rdkit import Chem from rdkit.Chem import Draw from rdkit.Chem import Descriptors from rdkit.Chem import MolFromSmiles, MolToSmiles import sys from make_smile import zinc_data_with_bracket_original, zinc_processed_with_bracket

def chem_kn_simulation(model, state, val, temperature=1.0, max_len=82):
    """Extend a partial SMILES token sequence by sampling from the model.

    Starting from the tokens in ``state``, repeatedly asks ``model`` for the
    next-symbol distribution, samples one symbol from it and appends it,
    until the end symbol "\\n" is sampled or the sequence grows past
    ``max_len``.

    Args:
        model: object exposing ``predict``; it receives the index sequence
            padded to ``max_len``. Assumes the result is indexable as
            ``[0][position]`` giving a probability vector over ``val`` --
            TODO confirm against the trained network.
        state: non-empty list of symbols, each present in ``val``.
        val: list of symbols; a symbol's position is its integer id.
        temperature: divisor applied to the log-probabilities before
            re-normalising; must be > 0. The default 1.0 leaves the
            distribution unchanged.
        max_len: padded input length; generation stops once the sequence
            is longer than this.

    Returns:
        A one-element list holding the generated list of symbol indices.

    Raises:
        ValueError: if a symbol of ``state`` (or "\\n") is not in ``val``.
        IndexError: if ``state`` is empty.
    """
    end_index = val.index("\n")  # loop-invariant, so looked up once
    get_int = [val.index(symbol) for symbol in state]

    while get_int[-1] != end_index:
        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre', value=0.)
        predictions = model.predict(x_pad)
        # Distribution predicted at the last real (non-padded) position.
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        # Re-normalise in float64 so multinomial accepts the vector.
        preds = np.log(preds) / temperature
        preds = np.exp(preds) / np.sum(np.exp(preds))
        next_probas = np.random.multinomial(1, preds, 1)
        get_int.append(np.argmax(next_probas))
        if len(get_int) > max_len:
            break

    all_posible = [get_int]
    # The original printed an undefined name (all_possible) here.
    print(all_posible)
    return all_posible

def predict_smile(all_posible, val):
    """Translate generated index sequences back into lists of SMILES symbols.

    Args:
        all_posible: list of index sequences, as returned by
            ``chem_kn_simulation``.
        val: list of symbols; ``val[i]`` is the symbol for index ``i``.

    Returns:
        One list of symbols per input sequence. The final index of each
        sequence is dropped and the first "&" symbol, if any, is removed.
    """
    new_compound = []
    for total_generated in all_posible:
        generate_smile = [val[index] for index in total_generated[:-1]]
        # "&" is only present when the seed state contained it; an
        # unconditional remove() raised ValueError for seeds like ['C','O','C'].
        if "&" in generate_smile:
            generate_smile.remove("&")
        new_compound.append(generate_smile)
    return new_compound

def make_input_smile(generate_smile):
    """Join each list of SMILES symbols into a single SMILES string.

    Args:
        generate_smile: list of symbol lists, as returned by ``predict_smile``.

    Returns:
        A list with one concatenated string per input symbol list.
    """
    return [''.join(symbols) for symbols in generate_smile]

Example of using the above three functions to generate molecules

# Symbol vocabulary: a symbol's position in this list is its integer id.
# The backslash entry must be written '\\'; a lone '\' escapes the closing quote.
val = ['\n', '&', 'C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
       '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
       's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
       '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
       'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
       '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
       '[s+]', '[PH+]', '[PH]', '8', '[S@@+]']

# Sample one molecule starting from the seed symbols C, O, C and print it.
# NOTE(review): `model` is not defined in this snippet -- presumably it comes
# from the imported `loaded_model` helper; confirm before running.
all_posible = chem_kn_simulation(model, ['C', 'O', 'C'], val)
generate_smile = predict_smile(all_posible, val)
new_compound = make_input_smile(generate_smile)
print(new_compound)

yangxiufengsia commented 6 years ago

`from subprocess import Popen, PIPE from math import * import random import numpy as np import random as pr from copy import deepcopy from types import IntType, ListType, TupleType, StringTypes import itertools import time import math import tensorflow as tf import argparse from load_model import loaded_model from keras.preprocessing import sequence from rdkit import Chem from rdkit.Chem import Draw from rdkit.Chem import Descriptors from rdkit.Chem import MolFromSmiles, MolToSmiles import sys from make_smile import zinc_data_with_bracket_original, zinc_processed_with_bracket

def chem_kn_simulation(model, state, val, temperature=1.0, max_len=82):
    """Extend a partial SMILES token sequence by sampling from the model.

    Starting from the tokens in ``state``, repeatedly asks ``model`` for the
    next-symbol distribution, samples one symbol from it and appends it,
    until the end symbol "\\n" is sampled or the sequence grows past
    ``max_len``.

    Args:
        model: object exposing ``predict``; it receives the index sequence
            padded to ``max_len``. Assumes the result is indexable as
            ``[0][position]`` giving a probability vector over ``val`` --
            TODO confirm against the trained network.
        state: non-empty list of symbols, each present in ``val``.
        val: list of symbols; a symbol's position is its integer id.
        temperature: divisor applied to the log-probabilities before
            re-normalising; must be > 0. The default 1.0 leaves the
            distribution unchanged.
        max_len: padded input length; generation stops once the sequence
            is longer than this.

    Returns:
        A one-element list holding the generated list of symbol indices.

    Raises:
        ValueError: if a symbol of ``state`` (or "\\n") is not in ``val``.
        IndexError: if ``state`` is empty.
    """
    end_index = val.index("\n")  # loop-invariant, so looked up once
    get_int = [val.index(symbol) for symbol in state]

    while get_int[-1] != end_index:
        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre', value=0.)
        predictions = model.predict(x_pad)
        # Distribution predicted at the last real (non-padded) position.
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        # Re-normalise in float64 so multinomial accepts the vector.
        preds = np.log(preds) / temperature
        preds = np.exp(preds) / np.sum(np.exp(preds))
        next_probas = np.random.multinomial(1, preds, 1)
        get_int.append(np.argmax(next_probas))
        if len(get_int) > max_len:
            break

    all_posible = [get_int]
    # The original printed an undefined name (all_possible) here.
    print(all_posible)
    return all_posible

def predict_smile(all_posible, val):
    """Translate generated index sequences back into lists of SMILES symbols.

    Args:
        all_posible: list of index sequences, as returned by
            ``chem_kn_simulation``.
        val: list of symbols; ``val[i]`` is the symbol for index ``i``.

    Returns:
        One list of symbols per input sequence. The final index of each
        sequence is dropped and the first "&" symbol, if any, is removed.
    """
    new_compound = []
    for total_generated in all_posible:
        generate_smile = [val[index] for index in total_generated[:-1]]
        # "&" is only present when the seed state contained it; an
        # unconditional remove() raised ValueError for seeds like ['C','O','C'].
        if "&" in generate_smile:
            generate_smile.remove("&")
        new_compound.append(generate_smile)
    return new_compound

def make_input_smile(generate_smile):
    """Join each list of SMILES symbols into a single SMILES string.

    Args:
        generate_smile: list of symbol lists, as returned by ``predict_smile``.

    Returns:
        A list with one concatenated string per input symbol list.
    """
    return [''.join(symbols) for symbols in generate_smile]

Example of using the above three functions to generate molecules

# Symbol vocabulary: a symbol's position in this list is its integer id.
# The backslash entry must be written '\\'; a lone '\' escapes the closing quote.
val = ['\n', '&', 'C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
       '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
       's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
       '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
       'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
       '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
       '[s+]', '[PH+]', '[PH]', '8', '[S@@+]']

all_posible=chem_kn_simulation(model,['C','O','C'],val) generate_smile=predict_smile(all_posible,val) new_compound=make_input_smile(generate_smile) print new_compound `

yangxiufengsia commented 6 years ago

test.txt

yangxiufengsia commented 6 years ago

Save test.txt as test.py, and you can use it to generate molecules.

ALPHAYA-Japan commented 6 years ago

@yangxiufengsia Thanks for the great help. It WORKED! However, there were some minor mistakes in your code, so I corrected them and modified it as follows:

import sys
import math
import random
import numpy as np
import random as pr
from rdkit import Chem
from make_smile import *
from keras.models import load_model
from keras.preprocessing import sequence

def chem_kn_simulation(model, state, val, max_len=81):
    """Extend a partial SMILES token sequence by sampling from the model.

    Starting from the tokens in ``state``, repeatedly asks ``model`` for the
    next-symbol distribution, samples one symbol from it and appends it,
    until the end symbol "\\n" is sampled or the sequence grows past
    ``max_len``.

    Args:
        model: object exposing ``predict``; it receives the index sequence
            padded to ``max_len``. Assumes the result is indexable as
            ``[0][position]`` giving a probability vector over ``val`` --
            TODO confirm against the trained network.
        state: non-empty list of symbols, each present in ``val``.
        val: list of symbols; a symbol's position is its integer id.
        max_len: padded input length; generation stops once the sequence
            is longer than this.

    Returns:
        A one-element list holding the generated list of symbol indices.

    Raises:
        ValueError: if a symbol of ``state`` (or "\\n") is not in ``val``.
        IndexError: if ``state`` is empty.
    """
    end_index = val.index("\n")  # loop-invariant, so looked up once
    get_int = [val.index(symbol) for symbol in state]

    while get_int[-1] != end_index:
        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre', value=0.0)
        predictions = model.predict(x_pad)
        # Distribution predicted at the last real (non-padded) position,
        # re-normalised in float64 so multinomial accepts the vector.
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        preds = preds / np.sum(preds)
        next_probas = np.random.multinomial(1, preds, 1)
        get_int.append(np.argmax(next_probas))
        if len(get_int) > max_len:
            break

    return [get_int]

def predict_smile(all_posible,val):
    """Map each generated index sequence back to its list of symbols.

    The last index of every sequence is left out; all other indices are
    looked up in ``val``. Returns one symbol list per input sequence.
    """
    decoded = []
    for index_seq in all_posible:
        symbols = [val[token_id] for token_id in index_seq[:-1]]
        decoded.append(symbols)
    return decoded

def make_input_smile(generate_smile):
    """Concatenate every symbol list into one SMILES string.

    Returns a list with one string per entry of ``generate_smile``.
    """
    return [''.join(symbols) for symbols in generate_smile]

### Example of using the above three functions to generate molecules
# Symbol vocabulary. A symbol's position in this list is the integer id that
# chem_kn_simulation derives with val.index(...) and passes to the model, and
# that predict_smile turns back into text with val[...].
# '\n' is the end marker that stops sampling in chem_kn_simulation.
# NOTE(review): presumably the order must match the vocabulary used when the
# model was trained -- confirm before editing or reordering entries.
val = [ '\n', '&','C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
        '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
        's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
        '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
        'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
        '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
        '[s+]', '[PH+]', '[PH]', '8', '[S@@+]']

if __name__ == "__main__":
    # Command line: <script> MODEL_FILE SEED_SMILES
    # Fail with a usage message instead of a bare IndexError on sys.argv.
    if len(sys.argv) < 3:
        sys.exit("usage: {} MODEL_FILE SEED_SMILES".format(sys.argv[0]))
    model_path, smiles = sys.argv[1], sys.argv[2]

    # Split the seed SMILES into symbols with the project helper.
    # NOTE(review): presumably the helper wraps the symbols in a leading '&'
    # and a trailing '\n'; both are removed so sampling continues from the seed.
    _, all_smile = zinc_processed_with_bracket([smiles])
    seed_symbols = all_smile[0]
    seed_symbols.remove('&')
    seed_symbols.remove('\n')
    print(seed_symbols)

    model = load_model(model_path)
    all_posible = chem_kn_simulation(model, seed_symbols, val)
    generate_smile = predict_smile(all_posible, val)
    new_compound = make_input_smile(generate_smile)

    print(new_compound)
ALPHAYA-Japan commented 6 years ago

I just wonder if val is fixed or can be generated.

# Symbol list in question: a symbol's position here is the integer id used
# for it (looked up with val.index(...) and mapped back with val[...]).
val = [ '\n', '&','C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
        '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
        's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
        '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
        'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
        '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
        '[s+]', '[PH+]', '[PH]', '8', '[S@@+]']
yangxiufengsia commented 6 years ago

@ALPHAYA-Japan Good to know you can generate molecules now. val is obtained from the training dataset, and the symbols in val are used as the nodes of the search tree. Of course, you can use different symbols.