Open markme123 opened 2 years ago
Hi @markme123 -- thanks for getting in touch. Can you please tell us:
fast5_subset_bin = "/opt/miniconda3/bin/fast5_subset" def fast5_subset(save_path, reads_list, bacth_name, fast5_input=args.fast5_input): sp.run([fast5_subset_bin, "-i", fast5_input, "-s", save_path, "-l", reads_list, "-t", "25", "-r", "-f", bacth_name])
It's hard to tell if it's a problem with some FAST5 files because there are so many files
Hi @markme123 -- how are you generating your reads list? Where did you get your input files from? Is there a chance you only have the reads from MinKNOW's "pass" output folder, so the filtering has already been completed?
`#!/usr/bin/env python
''' @File : split_f5.py @Time : 2022/03/29 16:09:25 @Author : jiangmian @Version : 1.0 '''
import argparse from ont_fast5_api.conversion_tools import fast5_subset as fs import subprocess as sp import os import pandas as pd from Bio import SeqIO import glob
parser = argparse.ArgumentParser(description="fast5 split, 本程序调用了seqkit") parser.add_argument('-i', '--fast5_input', dest="fast5_input", required=True, help="fast5 文件所在") parser.add_argument('-p', '--flow', dest='flow', required=True, help="芯片上机号") parser.add_argument('-s', '--save_path', dest="save_path", required=True, help="输出目录") group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--summary', dest='summary', help="输入summary 文件绝对路径,--summary 与 --fastq 择一选择") group.add_argument('--fastq', dest='fastq', help="输入pass_fastq,fail_fastq路径, 示例为 /data/fastq_pass,/data/fastq_fail") parser.add_argument('-b', '--barcode', dest='barcode', action='store_true', help="有barcode") args = parser.parse_args()
fast5_subset_bin = "/opt/miniconda3/bin/fast5_subset" def fast5_subset(save_path, reads_list, bacth_name, fast5_input=args.fast5_input): sp.run([fast5_subset_bin, "-i", fast5_input, "-s", save_path, "-l", reads_list, "-t", "25", "-r", "-f", bacth_name])
def reads_list_out(reads_list, out_name): with open(out_name, "w") as out: for line in reads_list: out.write(line + "\n")
path = os.path.realpath(args.save_path) if bool(1 - os.path.exists(f"{path}/fast5_pass")): os.mkdir(f"{path}/fast5_pass") if bool(1 - os.path.exists(f"{path}/fast5_fail")): os.mkdir(f"{path}/fast5_fail")
if args.barcode:
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['read_id']
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
reads_list_out(passed, f"{path}/pass_reads_id_list")
fast5_subset(f"{path}/fast5_pass/{i}", f"{path}/pass_reads_idlist", f"{i}{args.flow}_pass")
fast5_subset(f"{path}/fast5_fail/{i}", f"{path}/fail_reads_idlist", f"{i}{args.flow}_fail")
print(f"{i} fast5 分离完毕")
elif args.fastq:
pass_fastq = args.fastq.split(",")[0]
fail_fastq = args.fastq.split(",")[1]
barcode = list(map(lambda x : x.split("/")[-1], glob.glob(f"{pass_fastq}/barcode")))
for i in barcode:
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{pass_fastq}/{i}/fastq", ">", f"{path}/pass_reads_id_list"], sheel=True)
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{fail_fastq}/{i}/fastq", ">", f"{path}/fail_reads_id_list"], sheel=True)
fast5_subset(f"{path}/fast5_pass/{i}", f"{path}/pass_reads_idlist", f"{i}{args.flow}_pass")
fast5_subset(f"{path}/fast5_fail/{i}", f"{path}/fail_reads_idlist", f"{i}{args.flow}_fail")
print(f"{i} fast5 分离完毕")
else:
print("--summary 与 --fastq 择一选择")
else:
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['read_id']
reads_list_out(passed, f"{path}/pass_reads_id_list")
fast5_subset(f"{path}/fast5_pass", f"{path}/pass_reads_id_list", f"{args.flow}_pass")
fast5_subset(f"{path}/fast5_fail", f"{path}/fail_reads_id_list", f"{args.flow}_fail")
elif args.fastq:
pass_fastq = args.fastq.split(",")[0]
fail_fastq = args.fastq.split(",")[1]
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{pass_fastq}/fastq", ">", f"{path}/pass_reads_id_list"], sheel=True)
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{fail_fastq}/*fastq", ">", f"{path}/fail_reads_id_list"], sheel=True)
fast5_subset(f"{path}/fast5_pass", f"{path}/pass_reads_id_list", f"{args.flow}_pass")
fast5_subset(f"{path}/fast5_fail", f"{path}/fail_reads_id_list", f"{args.flow}_fail")
else:
print("--summary 与 --fastq 择一选择")
print("fast5 分离完毕")
`
The above is all my code. The pass and FAIL parts are extracted separately
Hi @markme123 -- how about the other questions? Where did you get your input files from? Is there a chance you only have the reads from MinKNOW's "pass" output folder, so the filtering has already been completed?
I have confirmed that FAST5 is all
Hi @markme123 -- thanks very much for the extra information. I suspect the issue is down to Guppy splitting some of your reads into new ones -- this means that the read_id
field in your summary file is not necessary the same as the read_id
in your fast5 files. Instead, you need to use the parent_read_id
field in the summary file for your call to fast5_subset
.
For example, in your code, change these lines:
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['read_id']
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['read_id']
[...]
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['read_id']
reads_list_out(passed, f"{path}/pass_reads_id_list")
To this:
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['parent_read_id'] # <== changed to parent_read_id
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['parent_read_id'] # <== changed to parent_read_id
[...]
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['parent_read_id'] # <== changed to parent_read_id
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['parent_read_id'] # <== changed to parent_read_id
reads_list_out(passed, f"{path}/pass_reads_id_list")
Can you try that and see if it works? Note that this method only works with summary files -- it won't work with your seqkit
approach and fastqs.
Fast5_subset fast5 separation pass and FAIL, found that the latest R10 FAST5 has a lot of extraction can not be out . I checked that there were no fast5 incompletions, and this happened on many R10 versions, but not on R9 . A third of them didn't come out.
| 3108059 of 5017321|######################## | 61% ETA: 0:24:25