Open awgymer opened 9 months ago
I have a similar issue. I have developed a code that identify reads that are completely soft-clipped (e.g. 150S) See code below
import pysam
import re
import sys
import os
if len(sys.argv) != 2:
print("Usage: python improve_bam.py input_bam_path")
sys.exit(1)
# Input BAM file path provided as a command-line argument
input_bam_path = sys.argv[1]
# Generate the output BAM file path
output_directory = os.path.dirname(input_bam_path)
input_filename = os.path.basename(input_bam_path)
output_filename = input_filename.replace(".bam", "_improved.bam")
output_bam_path = os.path.join(output_directory, output_filename)
# Additional parameters
POS = 0
FLAG = 4 # Set FLAG to indicate an unmapped read
RNAME = -1 # Set RNAME to -1 to indicate unmapped
MAPQ = 0
CIGAR = "*" # Set CIGAR to '*' to indicate no cigar string
RNEXT = -1 # Set RNEXT to -1 to indicate no "next read"
PNEXT = -1 # Set PNEXT to -1 to indicate no "next position"
TLEN = 0 # Set TLEN to 0 to indicate no template length
#print(f"Flag: {FLAG}")
#print(f"MAPQ: {MAPQ}")
#print(f"RNAME: {RNAME}")
#print(f"RNEXT: {RNEXT}")
#print(f"PNEXT: {PNEXT}")
#print(f"TLEN: {TLEN}")
# Open input BAM file
input_bam = pysam.AlignmentFile(input_bam_path, "rb")
# Open output BAM file for writing
output_bam = pysam.AlignmentFile(output_bam_path, "wb", header=input_bam.header)
total_reads = 0
changed_reads = 0
for read in input_bam:
total_reads += 1
# Check if the CIGAR string matches the pattern: <number>S
cigar_string = read.cigarstring
if cigar_string and re.match(r'^\d+S$', cigar_string):
# Print the read before modification
print("DURING MODIFICATION:")
print(f"Flag-Before: {read.flag}")
read.flag = FLAG
print(f"Flag: {read.flag}")
print(f"POS-Before: {read.pos}")
read.pos = POS
print(f"POS: {read.pos}")
print(f"MAPQ-Before: {read.mapping_quality}")
read.mapping_quality = MAPQ
print(f"MAPQ: {read.mapping_quality}")
print(f"RNAME-Before: {read.rname}")
read.rname = RNAME
print(f"RNAME: {read.rname}")
print(f"RNEXT-Before: {read.rnext}")
read.rnext = RNEXT
print(f"RNEXT: {read.rnext}")
print(f"PNEXT-Before: {read.pnext}")
read.pnext = PNEXT
print(f"PNEXT: {read.pnext}")
print(f"TLEN-Before: {read.tlen}")
read.tlen = TLEN
print(f"TLEN: {read.tlen}")
print(f"CIGAR-Before: {read.cigarstring}")
read.cigarstring = CIGAR
print(f"CIGAR: {read.cigarstring}")
# Print the read after modification
print("AFTER MODIFICATION:")
print(f"Flag: {read.flag}")
print(f"MAPQ: {read.mapping_quality}")
print(f"RNAME: {read.rname}")
print(f"RNEXT: {read.rnext}")
print(f"PNEXT: {read.pnext}")
print(f"TLEN: {read.tlen}")
print(f"POS: {read.pos}")
changed_reads += 1
# Write the modified or unmodified read to the output BAM file
output_bam.write(read)
# Close the input and output BAM files
input_bam.close()
output_bam.close()
# Print the total and changed reads counts
print("Total reads in the BAM file:", total_reads)
print("Changed reads:", changed_reads)
This doesn't return any error. However, the reason is that the 0 is not properly written.
In my code the print statements return a 0 in POS. HOWEVER, I had a validation after this with ValidateSamFile and this retunr this
ERROR::INVALID_ALIGNMENT_START:Record 1699229, Read name ML-PT1-10:12:000H00CHL:1:13107:9070:8182, Alignment start should be 0 because reference name = *.
and then when I look at this read, there is a one
ML-PT1-10:12:000H00CHL:1:13107:9070:8182 4 * 1 0 * * 0 0 CCTAAGAGCAATCAGTGGGGAACAAGAAGTGGAGAATGTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG AAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFA00AFFF0FFFFFFFFFAFFFFFFFFFFFFFF000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF SA:Z:chr2,32916401,+,113S38M,0,0; RG:Z:MiniSeq_TruSight_Tumor_15_FFPE_and_control_samples_-27687666_HD729_S5andS6_L001_R1_001 UQ:i:74 AS:i:30
When trying to write a record that has
POS 0
it throws aValueError: Start coordinate must be non-negative
becausestart
is set to-1
.The issue is that
POS 0
is a legitimate value according to the spec:It should be noted that this record is one that is being created from an existing record that was read in fine. e.g. this breaks: