Closed marc-harary closed 3 years ago
>1a9l
GGGUGACUCCAGAGGUCGAGAGACCGGAGAUAUCACCC
# 1a9l
i j
1 38
2 37
3 36
4 35
5 34
6 33
7 29
8 28
9 27
10 26
14 25
15 24
16 23
17 22
def main():
    """Score SPOT-RNA predictions against the label files and save the results.

    Label files and FASTA files are globbed from their respective directories,
    sorted, and paired up by position.  For each pair the labels are read with
    ``read_label``, SPOT-RNA is run through ``spotPredict``, and the scores
    returned by ``calcF1MCC`` are stored in a row.  Every row is written to
    ``./output/spot/<name>.json``; all rows together are written to
    ``./output/stats/scores.csv``.
    """
    label_paths = sorted(glob("/gpfs/ysm/home/mah258/Pyle/evaluate/output/cleaned/labels/*"))
    fasta_paths = sorted(glob("/gpfs/ysm/home/mah258/Pyle/evaluate/output/cleaned/fastas/*"))

    # NOTE(review): pairing relies purely on both sorted listings lining up;
    # zip() stops silently at the shorter list -- confirm the two directories
    # always contain matching files.
    records = []
    for index, (label_path, fasta_path) in enumerate(zip(label_paths, fasta_paths)):
        # The record id is the FASTA file name without its extension.
        stem = os.path.splitext(os.path.basename(fasta_path))[0]
        print(f"Reading file {index} ({stem})...")

        native_pairs, BPnat = read_label(label_path)
        sequence, predicted_ct, predicted_count = spotPredict(fasta_path)
        f1, mcc = calcF1MCC(sequence, predicted_ct, native_pairs)

        # Only the SPOT-RNA columns are populated; the mxfold2 and e2efold
        # columns (F1_*, MCC_*, BPpred_*) are not produced by this function.
        record = {
            "id": stem,
            "BPnat": BPnat,
            "F1_SPOT-RNA": f1,
            "MCC_SPOT-RNA": mcc,
            "BPpred_SPOT-RNA": predicted_count,
        }
        records.append(record)

        # Keep a per-molecule copy of the row next to the SPOT-RNA output.
        with open(f"./output/spot/{stem}.json", "w") as handle:
            json.dump(record, handle)

    pd.DataFrame(records).to_csv("./output/stats/scores.csv", index=False)
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
def spotPredict(input_path, output_path="/gpfs/ysm/home/mah258/Pyle/evaluate/output/spot"):
    """Run SPOT-RNA on one input file and read back the resulting ``.ct`` file.

    Args:
        input_path: Path handed to SPOT-RNA via ``--inputs``.
        output_path: Directory handed to SPOT-RNA via ``--outputs``; the
            ``.ct`` file is expected there afterwards.

    Returns:
        The three values produced by ``read_ct`` for ``<output_path>/<name>.ct``,
        where ``<name>`` is the input file name without its extension.

    Raises:
        subprocess.CalledProcessError: If the SPOT-RNA process exits non-zero.
    """
    # The script path is relative, so this depends on the current working
    # directory of the caller.
    subprocess.check_output(
        [
            "python",
            "../SPOT-RNA/SPOT-RNA.py",
            "--inputs=" + input_path,
            "--outputs=" + output_path,
        ]
    )

    # SPOT-RNA's result is looked up under the input's base name with ".ct".
    stem = os.path.splitext(os.path.basename(input_path))[0]
    ct_path = os.path.join(output_path, stem + ".ct")

    seq, pairs, pair_count = read_ct(ct_path)
    return seq, pairs, pair_count
38 1a9l SPOT-RNA output
1 G 0 2 38 1
2 G 1 3 37 2
3 G 2 4 36 3
4 U 3 5 35 4
5 G 4 6 34 5
6 A 5 7 33 6
7 C 6 8 29 7
8 U 7 9 28 8
9 C 8 10 27 9
10 C 9 11 26 10
11 A 10 12 0 11
12 G 11 13 0 12
13 A 12 14 0 13
14 G 13 15 25 14
15 G 14 16 24 15
16 U 15 17 23 16
17 C 16 18 22 17
18 G 17 19 21 18
19 A 18 20 0 19
20 G 19 21 0 20
21 A 20 22 18 21
22 G 21 23 17 22
23 A 22 24 16 23
24 C 23 25 15 24
25 C 24 26 14 25
26 G 25 27 10 26
27 G 26 28 9 27
28 A 27 29 8 28
29 G 28 30 7 29
30 A 29 31 0 30
31 U 30 32 0 31
32 A 31 33 0 32
33 U 32 34 6 33
34 C 33 35 5 34
35 A 34 36 4 35
36 C 35 37 3 36
37 C 36 38 2 37
38 C 37 0 1 38
The preliminary SPOT-RNA results on the molecules it was trained on are extremely poor, and I'm wondering why. (Attachment: scores.zip)