molecularmodelinglab / PhaKinPro

Pharmacokinetic property prediction with QSAR modeling
https://phakinpro.mml.unc.edu/
MIT License
6 stars 2 forks source link

Fail to run Phakinpro #4

Open dixonth1 opened 2 months ago

dixonth1 commented 2 months ago

When I tried to run python phankinpro.py, I noticed that the code did not produce any outputs.

I made some changes to the file to make it run and save a civ file with the results. Please feel free to use this code if you see fit.

`

Function to save user inputs from argparse.

def get_parser_options(): """ @return: argparse options @rtype: class argparse.Namespace """ parser = argparse.ArgumentParser()

parser.add_argument("--infile", type=str, required=True,
                    help="location to csv of SMILES")
parser.add_argument("--outfile", type=str, default=os.path.join(os.getcwd(), "phakin_output.csv"),
                    help="location and file name for output")
parser.add_argument("--smiles_col", type=str, default="SMILES",
                    help="column name containing SMILES of interest"),

# Code only works if calc_ad is True. Need to fix before allowing user
# choice.
#parser.add_argument("--ad", action="store_true",
#                    help="calculate the AD")

options = parser.parse_args()
return options

Renamed main to adme_prediction so I remember that this is

the functon that makes the adme prediction.

def adme_prediction(smiles, output_dict, calculate_ad=True, make_prop_img=False, **kwargs):

def default(key, d):
    if key in d.keys():
        return d[key]
    else:
        return False

# Putting the models direct
models = sorted([f for f in glob.glob("./models/*.pgz")], key=lambda x: x.split("_")[1])
models_data = sorted([f for f in glob.glob("./models/*.pbz2")], key=lambda x: x.split("_")[1])

values = {}

for model_endpoint, model_data in zip(models, models_data):
    if not default(MODEL_DICT_INVERT[os.path.basename(model_endpoint)], kwargs):
        continue
    with gzip.open(model_endpoint, 'rb') as f:
        model = cPickle.load(f)

    with bz2.BZ2File(model_data, 'rb') as f:
        model_data = cPickle.load(f)

    pred, pred_proba, ad = run_prediction(model, model_data, smiles, calculate_ad=calculate_ad)

    svg_str = ""
    if make_prop_img:
        svg_str = get_prob_map(model, smiles)

    values.setdefault(MODEL_DICT_INVERT[os.path.basename(model_endpoint)], []).append([int(pred), str(round(float(pred_proba) * 100, 2)) + "%", AD_DICT[ad], svg_str])

processed_results = []

for key, val in values.items():
    if key in ['Hepatic Stability', 'Renal Clearance', 'Plasma Half-life', 'Oral Bioavailability']:
        new_pred = multiclass_ranking([_[0] for _ in val])
        if new_pred == 0:
            processed_results.append([key, "Inconsistent result: no prediction", "Very unconfident", "NA", ""])
        else:
            # this is because of how the hierarchical model works
            if new_pred in [1, 2]:
                p = 0
            else:
                p = new_pred - 2
            processed_results.append([key, CLASSIFICATION_DICT[key][new_pred], val[p][1], val[p][2], val[p][3]])
    else:
        processed_results.append([key, CLASSIFICATION_DICT[key][val[0][0]], val[0][1], val[0][2], val[0][3]])

# Store results in output dictionary.
for adme_prop in processed_results:
    adme_prop_name = adme_prop[0]
    output_dict[f"{adme_prop_name}"].append(adme_prop[1])
    output_dict[f"{adme_prop_name}_proba"].append(adme_prop[2])
    output_dict[f"{adme_prop_name}_AD"].append(adme_prop[3])

return output_dict

Obsolete. Using pandas to save csv file.

def write_csv_file(smiles_list, calculate_ad=False):

headers = []

for _key in MODEL_DICT.keys():

headers.append(_key)

headers.append(_key+"_proba")

if calculate_ad:

headers.append(_key+"_AD")

string_file = StringIO()

writer = csv.DictWriter(string_file, fieldnames=['SMILES', *headers])

writer.writeheader()

for smiles in tqdm(smiles_list):

molecule = MolFromSmiles(smiles)

row = {'SMILES': smiles}

if molecule is None:

row['SMILES'] = f"(invalid){smiles}"

writer.writerow(row)

continue

data = main(smiles, calculate_ad=calculate_ad, **MODEL_DICT)

for model_name, pred, predproba, ad, in data:

print(pred, pred_proba, ad)

try:

pred_proba = float(pred_proba[:-1]) / 100 # covert back to 0-1 float

row[model_name] = pred

row[model_name + "_proba"] = pred_proba if pred == 1 else 1 - pred_proba # this is to make sure its proba for class 1

except ValueError:

row[model_name] = "No prediction" # if pred_proba is string skip

if calculate_ad:

row[model_name + "_AD"] = ad

writer.writerow(row)

return string_file.getvalue()

if name == "main":

# Load in user inputs into code
args = get_parser_options()

# Input and Output Paths
input_csv = args.infile
output_csv = args.outfile

# Column name that contains the smile string
smile_col = args.smiles_col

# Should we calculate applicability domain
# Code does not work if calculate_ad is set to false.
calculate_ad = True

# Load in csv file
molecule_df = pd.read_csv(f"{input_csv}")

# Get list of smiles to compute model on.
smiles_list = molecule_df[f'{smile_col}'].to_numpy()

# Dictionary to store results.
output_dict={'SMILES':[]}
for _key in MODEL_DICT.keys():
    output_dict.update({_key:[]})
    output_dict.update({f"{_key}_proba":[]})
    if calculate_ad:
        output_dict.update({f"{_key}_AD":[]})

# Run model and save in dictionary we made above.
for smile in smiles_list:
    output_dict['SMILES'].append(smile)
    output_dict = adme_prediction(smile, output_dict, calculate_ad=calculate_ad, make_prop_img=False, **MODEL_DICT)

# Save output
output_df = pd.DataFrame(output_dict)
output_df.to_csv(f"{output_csv}")

`