markme123 commented 2 years ago

Fast5_subset fast5 separation pass and FAIL, found that the latest R10 FAST5 has a lot of extraction can not be out . I checked that there were no fast5 incompletions, and this happened on many R10 versions, but not on R9 . A third of them didn't come out.

| 3108059 of 5017321|######################## | 61% ETA: 0:24:25

3140641 of 5017321|######################## | 62% ETA: 0:23:55 | 3178341 of 5017321|######################## | 63% ETA: 0:23:27
3208333 of 5017321|######################## | 63% ETA: 0:23:03 | 3238381 of 5017321|######################### | 64% ETA: 0:22:36
3273840 of 5017321|######################### | 65% ETA: 0:22:08 | 3303857 of 5017321|######################### | 65% ETA: 0:21:44
333769 \ 3364211 of 5017321|########################## | 67% ETA: 0:20:59 / 3378623 of 5017321|########################## | 67% ETA: 0:20:53 | 5017321 of 5017321|#######################################|100% Time: 0:43:08 INFO:Fast5Filter:3377354 reads extracted WARNING:Fast5Filter:1638696 reads not found!

fbrennen commented 2 years ago

Hi @markme123 -- thanks for getting in touch. Can you please tell us:

The full command you ran.
Whether or not there is a particular pattern to the reads that were not found -- are they, for example, already in a fail folder somewhere?

markme123 commented 2 years ago

fast5_subset_bin = "/opt/miniconda3/bin/fast5_subset" def fast5_subset(save_path, reads_list, bacth_name, fast5_input=args.fast5_input): sp.run([fast5_subset_bin, "-i", fast5_input, "-s", save_path, "-l", reads_list, "-t", "25", "-r", "-f", bacth_name])

It's hard to tell if it's a problem with some FAST5 files because there are so many files

fbrennen commented 2 years ago

Hi @markme123 -- how are you generating your reads list? Where did you get your input files from? Is there a chance you only have the reads from MinKNOW's "pass" output folder, so the filtering has already been completed?

markme123 commented 2 years ago

`#!/usr/bin/env python

-- encoding: utf-8 --

''' @File : split_f5.py @Time : 2022/03/29 16:09:25 @Author : jiangmian @Version : 1.0 '''

import argparse from ont_fast5_api.conversion_tools import fast5_subset as fs import subprocess as sp import os import pandas as pd from Bio import SeqIO import glob

parser = argparse.ArgumentParser(description="fast5 split, 本程序调用了seqkit") parser.add_argument('-i', '--fast5_input', dest="fast5_input", required=True, help="fast5 文件所在") parser.add_argument('-p', '--flow', dest='flow', required=True, help="芯片上机号") parser.add_argument('-s', '--save_path', dest="save_path", required=True, help="输出目录") group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--summary', dest='summary', help="输入summary 文件绝对路径,--summary 与 --fastq 择一选择") group.add_argument('--fastq', dest='fastq', help="输入pass_fastq,fail_fastq路径, 示例为 /data/fastq_pass,/data/fastq_fail") parser.add_argument('-b', '--barcode', dest='barcode', action='store_true', help="有barcode") args = parser.parse_args()

fast5_subset_bin = "/opt/miniconda3/bin/fast5_subset" def fast5_subset(save_path, reads_list, bacth_name, fast5_input=args.fast5_input): sp.run([fast5_subset_bin, "-i", fast5_input, "-s", save_path, "-l", reads_list, "-t", "25", "-r", "-f", bacth_name])

def reads_list_out(reads_list, out_name): with open(out_name, "w") as out: for line in reads_list: out.write(line + "\n")

path = os.path.realpath(args.save_path) if bool(1 - os.path.exists(f"{path}/fast5_pass")): os.mkdir(f"{path}/fast5_pass") if bool(1 - os.path.exists(f"{path}/fast5_fail")): os.mkdir(f"{path}/fast5_fail")

if args.barcode: if args.summary: data = pd.read_csv(args.summary, sep="\t") barcode = set(list(data['barcode_arrangement'])) for i in barcode: failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['read_id'] passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['read_id'] reads_list_out(failed, f"{path}/fail_reads_id_list") reads_list_out(passed, f"{path}/pass_reads_id_list") fast5_subset(f"{path}/fast5_pass/{i}", f"{path}/pass_reads_idlist", f"{i}{args.flow}_pass") fast5_subset(f"{path}/fast5_fail/{i}", f"{path}/fail_reads_idlist", f"{i}{args.flow}_fail") print(f"{i} fast5 分离完毕") elif args.fastq: pass_fastq = args.fastq.split(",")[0] fail_fastq = args.fastq.split(",")[1] barcode = list(map(lambda x : x.split("/")[-1], glob.glob(f"{pass_fastq}/barcode"))) for i in barcode: sp.run(["seqkit", "fx2tab", "-i", "-n", f"{pass_fastq}/{i}/fastq", ">", f"{path}/pass_reads_id_list"], sheel=True) sp.run(["seqkit", "fx2tab", "-i", "-n", f"{fail_fastq}/{i}/fastq", ">", f"{path}/fail_reads_id_list"], sheel=True) fast5_subset(f"{path}/fast5_pass/{i}", f"{path}/pass_reads_idlist", f"{i}{args.flow}_pass") fast5_subset(f"{path}/fast5_fail/{i}", f"{path}/fail_reads_idlist", f"{i}{args.flow}_fail") print(f"{i} fast5 分离完毕") else: print("--summary 与 --fastq 择一选择")
else: if args.summary: data = pd.read_csv(args.summary, sep="\t") failed = data[data.passes_filtering == 0]['read_id'] reads_list_out(failed, f"{path}/fail_reads_id_list") passed = data[data.passes_filtering > 0]['read_id'] reads_list_out(passed, f"{path}/pass_reads_id_list") fast5_subset(f"{path}/fast5_pass", f"{path}/pass_reads_id_list", f"{args.flow}_pass") fast5_subset(f"{path}/fast5_fail", f"{path}/fail_reads_id_list", f"{args.flow}_fail") elif args.fastq: pass_fastq = args.fastq.split(",")[0] fail_fastq = args.fastq.split(",")[1] sp.run(["seqkit", "fx2tab", "-i", "-n", f"{pass_fastq}/fastq", ">", f"{path}/pass_reads_id_list"], sheel=True) sp.run(["seqkit", "fx2tab", "-i", "-n", f"{fail_fastq}/*fastq", ">", f"{path}/fail_reads_id_list"], sheel=True) fast5_subset(f"{path}/fast5_pass", f"{path}/pass_reads_id_list", f"{args.flow}_pass") fast5_subset(f"{path}/fast5_fail", f"{path}/fail_reads_id_list", f"{args.flow}_fail") else: print("--summary 与 --fastq 择一选择") print("fast5 分离完毕") `

markme123 commented 2 years ago

The above is all my code. The pass and FAIL parts are extracted separately

fbrennen commented 2 years ago

Hi @markme123 -- how about the other questions? Where did you get your input files from? Is there a chance you only have the reads from MinKNOW's "pass" output folder, so the filtering has already been completed?

markme123 commented 2 years ago

I have confirmed that FAST5 is all

fbrennen commented 2 years ago

Hi @markme123 -- thanks very much for the extra information. I suspect the issue is down to Guppy splitting some of your reads into new ones -- this means that the read_id field in your summary file is not necessary the same as the read_id in your fast5 files. Instead, you need to use the parent_read_id field in the summary file for your call to fast5_subset.

For example, in your code, change these lines:


if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['read_id']
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['read_id']

[...]

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['read_id']
reads_list_out(passed, f"{path}/pass_reads_id_list")

To this:

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['parent_read_id']  # <== changed to parent_read_id
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['parent_read_id']  # <== changed to parent_read_id

[...]

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['parent_read_id']  # <== changed to parent_read_id
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['parent_read_id']   # <== changed to parent_read_id
reads_list_out(passed, f"{path}/pass_reads_id_list")

Can you try that and see if it works? Note that this method only works with summary files -- it won't work with your seqkit approach and fastqs.

nanoporetech / ont_fast5_api

WARNING:Fast5Filter:xxx reads not found! #73

-- encoding: utf-8 --