parser.add_argument("--infile", type=str, required=True,
help="location to csv of SMILES")
parser.add_argument("--outfile", type=str, default=os.path.join(os.getcwd(), "phakin_output.csv"),
help="location and file name for output")
parser.add_argument("--smiles_col", type=str, default="SMILES",
help="column name containing SMILES of interest"),
# Code only works if calc_ad is True. Need to fix before allowing user
# choice.
#parser.add_argument("--ad", action="store_true",
# help="calculate the AD")
options = parser.parse_args()
return options
Renamed main to adme_prediction so I remember that this is
def default(key, d):
if key in d.keys():
return d[key]
else:
return False
# Putting the models direct
models = sorted([f for f in glob.glob("./models/*.pgz")], key=lambda x: x.split("_")[1])
models_data = sorted([f for f in glob.glob("./models/*.pbz2")], key=lambda x: x.split("_")[1])
values = {}
for model_endpoint, model_data in zip(models, models_data):
if not default(MODEL_DICT_INVERT[os.path.basename(model_endpoint)], kwargs):
continue
with gzip.open(model_endpoint, 'rb') as f:
model = cPickle.load(f)
with bz2.BZ2File(model_data, 'rb') as f:
model_data = cPickle.load(f)
pred, pred_proba, ad = run_prediction(model, model_data, smiles, calculate_ad=calculate_ad)
svg_str = ""
if make_prop_img:
svg_str = get_prob_map(model, smiles)
values.setdefault(MODEL_DICT_INVERT[os.path.basename(model_endpoint)], []).append([int(pred), str(round(float(pred_proba) * 100, 2)) + "%", AD_DICT[ad], svg_str])
processed_results = []
for key, val in values.items():
if key in ['Hepatic Stability', 'Renal Clearance', 'Plasma Half-life', 'Oral Bioavailability']:
new_pred = multiclass_ranking([_[0] for _ in val])
if new_pred == 0:
processed_results.append([key, "Inconsistent result: no prediction", "Very unconfident", "NA", ""])
else:
# this is because of how the hierarchical model works
if new_pred in [1, 2]:
p = 0
else:
p = new_pred - 2
processed_results.append([key, CLASSIFICATION_DICT[key][new_pred], val[p][1], val[p][2], val[p][3]])
else:
processed_results.append([key, CLASSIFICATION_DICT[key][val[0][0]], val[0][1], val[0][2], val[0][3]])
# Store results in output dictionary.
for adme_prop in processed_results:
adme_prop_name = adme_prop[0]
output_dict[f"{adme_prop_name}"].append(adme_prop[1])
output_dict[f"{adme_prop_name}_proba"].append(adme_prop[2])
output_dict[f"{adme_prop_name}_AD"].append(adme_prop[3])
return output_dict
data = main(smiles, calculate_ad=calculate_ad, **MODEL_DICT)
for model_name, pred, predproba, ad, in data:
print(pred, pred_proba, ad)
try:
pred_proba = float(pred_proba[:-1]) / 100 # covert back to 0-1 float
row[model_name] = pred
row[model_name + "_proba"] = pred_proba if pred == 1 else 1 - pred_proba # this is to make sure its proba for class 1
except ValueError:
row[model_name] = "No prediction" # if pred_proba is string skip
if calculate_ad:
row[model_name + "_AD"] = ad
writer.writerow(row)
return string_file.getvalue()
if name == "main":
# Load in user inputs into code
args = get_parser_options()
# Input and Output Paths
input_csv = args.infile
output_csv = args.outfile
# Column name that contains the smile string
smile_col = args.smiles_col
# Should we calculate applicability domain
# Code does not work if calculate_ad is set to false.
calculate_ad = True
# Load in csv file
molecule_df = pd.read_csv(f"{input_csv}")
# Get list of smiles to compute model on.
smiles_list = molecule_df[f'{smile_col}'].to_numpy()
# Dictionary to store results.
output_dict={'SMILES':[]}
for _key in MODEL_DICT.keys():
output_dict.update({_key:[]})
output_dict.update({f"{_key}_proba":[]})
if calculate_ad:
output_dict.update({f"{_key}_AD":[]})
# Run model and save in dictionary we made above.
for smile in smiles_list:
output_dict['SMILES'].append(smile)
output_dict = adme_prediction(smile, output_dict, calculate_ad=calculate_ad, make_prop_img=False, **MODEL_DICT)
# Save output
output_df = pd.DataFrame(output_dict)
output_df.to_csv(f"{output_csv}")
When I tried to run python phankinpro.py, I noticed that the code did not produce any outputs.
I made some changes to the file to make it run and save a civ file with the results. Please feel free to use this code if you see fit.
`
Function to save user inputs from argparse.
def get_parser_options(): """ @return: argparse options @rtype: class argparse.Namespace """ parser = argparse.ArgumentParser()
Renamed main to adme_prediction so I remember that this is
the functon that makes the adme prediction.
def adme_prediction(smiles, output_dict, calculate_ad=True, make_prop_img=False, **kwargs):
Obsolete. Using pandas to save csv file.
def write_csv_file(smiles_list, calculate_ad=False):
headers = []
for _key in MODEL_DICT.keys():
headers.append(_key)
headers.append(_key+"_proba")
if calculate_ad:
headers.append(_key+"_AD")
string_file = StringIO()
writer = csv.DictWriter(string_file, fieldnames=['SMILES', *headers])
writer.writeheader()
for smiles in tqdm(smiles_list):
molecule = MolFromSmiles(smiles)
row = {'SMILES': smiles}
if molecule is None:
row['SMILES'] = f"(invalid){smiles}"
writer.writerow(row)
continue
data = main(smiles, calculate_ad=calculate_ad, **MODEL_DICT)
for model_name, pred, predproba, ad, in data:
print(pred, pred_proba, ad)
try:
pred_proba = float(pred_proba[:-1]) / 100 # covert back to 0-1 float
row[model_name] = pred
row[model_name + "_proba"] = pred_proba if pred == 1 else 1 - pred_proba # this is to make sure its proba for class 1
except ValueError:
row[model_name] = "No prediction" # if pred_proba is string skip
if calculate_ad:
row[model_name + "_AD"] = ad
writer.writerow(row)
return string_file.getvalue()
if name == "main":
`