kad-ecoli / rna3db

maintain local copy of RNA structure database
0 stars 0 forks source link

Preliminary data for SPOT-RNA #9

Closed marc-harary closed 3 years ago

marc-harary commented 3 years ago

The preliminary SPOT-RNA results on the trained molecules are extremely poor, and I am not sure why. The scores are attached: scores.zip

marc-harary commented 3 years ago

>1a9l
GGGUGACUCCAGAGGUCGAGAGACCGGAGAUAUCACCC

# 1a9l
i             j
1             38            
2             37            
3             36            
4             35            
5             34            
6             33            
7             29            
8             28            
9             27            
10            26            
14            25            
15            24            
16            23            
17            22  
marc-harary commented 3 years ago

def main():
    """Score SPOT-RNA predictions against reference labels and save the results.

    Label files and FASTA files are globbed from two fixed directories,
    sorted, and paired positionally. For every pair the reference is read
    with ``read_label``, a prediction is produced with ``spotPredict`` and
    compared with ``calcF1MCC``. Each result row is written to
    ``./output/spot/<name>.json`` as soon as it is available, and all rows
    are finally written to ``./output/stats/scores.csv``.

    Raises:
        ValueError: if the two directories contain a different number of
            files, because positional pairing would then be unreliable.
    """
    labels = sorted(glob("/gpfs/ysm/home/mah258/Pyle/evaluate/output/cleaned/labels/*"))
    seqs = sorted(glob("/gpfs/ysm/home/mah258/Pyle/evaluate/output/cleaned/fastas/*"))

    # zip() stops at the shorter list without complaint; a count mismatch
    # means at least one file has no partner and every later pair may be
    # shifted, so fail loudly instead of scoring against the wrong labels.
    if len(labels) != len(seqs):
        raise ValueError(
            f"Found {len(labels)} label files but {len(seqs)} FASTA files; "
            "cannot pair them by sorted order."
        )
    # NOTE(review): pairing still relies on both directories sorting into the
    # same order -- presumably the files share a base name; confirm.

    # Create the output directories up front so a missing folder does not
    # abort the run only after the predictions have been computed.
    os.makedirs("./output/spot", exist_ok=True)
    os.makedirs("./output/stats", exist_ok=True)

    dicts = []

    for counter, (label, seq) in enumerate(zip(labels, seqs)):
        # The molecule id is the FASTA file name without its extension.
        name, _ = os.path.splitext(os.path.basename(seq))
        print(f"Reading file {counter} ({name})...")
        label_list, BPnat = read_label(label)
        spotSeq, spotCt, spotPred = spotPredict(seq)
        spotF1, spotMCC = calcF1MCC(spotSeq, spotCt, label_list)
        row = {
                "id": name,
                "BPnat": BPnat,
                "F1_SPOT-RNA": spotF1,
                "MCC_SPOT-RNA": spotMCC,
                "BPpred_SPOT-RNA": spotPred,
                # Columns for the other predictors are currently disabled.
                # "F1_mxfold2": mxF1,
                # "MCC_mxfold2": mxMCC,
                # "BPpred_mxfold2": mxPred,
                # "F1_e2efold": e2eF1,
                # "MCC_e2efold": e2eMCC,
                # "BPpred_e2efold": e2ePred
        }
        dicts.append(row)
        # Persist each row immediately so partial results survive a crash.
        with open(f"./output/spot/{name}.json", "w") as f:
            json.dump(row, f)

    df = pd.DataFrame(dicts)
    df.to_csv("./output/stats/scores.csv", index=False)

# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()
marc-harary commented 3 years ago

def spotPredict(input_path, output_path="/gpfs/ysm/home/mah258/Pyle/evaluate/output/spot"):
    """Run SPOT-RNA on one input file and read back the resulting CT file.

    The script ``../SPOT-RNA/SPOT-RNA.py`` is launched with ``python`` as a
    subprocess; the relative script path is resolved against the current
    working directory. ``subprocess.check_output`` raises
    ``CalledProcessError`` if the script exits with a non-zero status.

    Args:
        input_path: path of the input file passed as ``--inputs``.
        output_path: directory passed as ``--outputs``; the CT file is
            expected there as ``<input base name>.ct``.

    Returns:
        The three values produced by ``read_ct`` for that CT file, in order.
    """
    subprocess.check_output(
        [
            "python",
            "../SPOT-RNA/SPOT-RNA.py",
            "--inputs=" + input_path,
            "--outputs=" + output_path,
        ]
    )

    # The CT file is named after the input file, minus its extension.
    stem = os.path.splitext(os.path.basename(input_path))[0]
    ct_path = os.path.join(output_path, stem + ".ct")

    seq, pairs, n_pred = read_ct(ct_path)
    return seq, pairs, n_pred
marc-harary commented 3 years ago

38      1a9l        SPOT-RNA output

1       G       0       2       38      1
2       G       1       3       37      2
3       G       2       4       36      3
4       U       3       5       35      4
5       G       4       6       34      5
6       A       5       7       33      6
7       C       6       8       29      7
8       U       7       9       28      8
9       C       8       10      27      9
10      C       9       11      26      10
11      A       10      12      0       11
12      G       11      13      0       12
13      A       12      14      0       13
14      G       13      15      25      14
15      G       14      16      24      15
16      U       15      17      23      16
17      C       16      18      22      17
18      G       17      19      21      18
19      A       18      20      0       19
20      G       19      21      0       20
21      A       20      22      18      21
22      G       21      23      17      22
23      A       22      24      16      23
24      C       23      25      15      24
25      C       24      26      14      25
26      G       25      27      10      26
27      G       26      28      9       27
28      A       27      29      8       28
29      G       28      30      7       29
30      A       29      31      0       30
31      U       30      32      0       31
32      A       31      33      0       32
33      U       32      34      6       33
34      C       33      35      5       34
35      A       34      36      4       35
36      C       35      37      3       36
37      C       36      38      2       37
38      C       37      0       1       38