Closed Koeng101 closed 7 months ago
Channels should be passing values, not pointers. This needs to be fixed.
Things to do:
func TrimBarcodeWithChannels(barcodeName string, fastqInput <-chan fastq.Read, fastqOutput chan<- fastq.Read) error {
for {
select {
case data, ok := <-fastqInput:
if !ok {
// If the input channel is closed, we close the output channel and return
close(fastqOutput)
return nil
}
trimmedRead, err := TrimBarcode(barcodeName, *data)
if err != nil {
close(fastqOutput)
return err
}
fastqOutput <- &trimmedRead
}
}
}
// TrimBarcode takes a barcodeName and a fastqSequence and returns a trimmed
// barcode.
func TrimBarcode(barcodeName string, fastqRead fastq.Read) (fastq.Read, error) {
// Copy into new fastq read
var newFastqRead fastq.Read
newFastqRead.Identifier = fastqRead.Identifier
newFastqRead.Optionals = make(map[string]string)
for key, value := range fastqRead.Optionals {
newFastqRead.Optionals[key] = value
}
// Get the barcode
barcode, ok := NativeBarcodeMap[barcodeName]
if !ok {
return newFastqRead, fmt.Errorf("barcode %s not found in NativeBarcodeMap.", barcodeName)
}
// Trim in the forward direction
var sequence string
var quality string
sequence = fastqRead.Sequence
quality = fastqRead.Quality
if len(sequence) > 80 {
score, alignment, err := Align(sequence[:80], barcode.Forward)
if err != nil {
return newFastqRead, err
}
// Empirically from looking around, it seems to me that approximately a
// score of 21 looks like a barcode match. This is almost by definition a
// magic number, so it is defined above as so.
if score >= ScoreMagicNumber {
modifiedAlignment := strings.ReplaceAll(alignment, "-", "")
index := strings.Index(sequence, modifiedAlignment)
// The index needs to be within the first 80 base pairs. Usually the
// native adapter + native barcode is within the first ~70 bp, but we
// put a small buffer.
if index != -1 {
// 7 here is a magic number between the native barcode and your
// target sequence. It's just a number that exists irl, so it is
// not defined above.
newStart := index + 7
if newStart < len(sequence) {
sequence = sequence[newStart:]
quality = quality[newStart:]
}
}
}
}
// Now do the same thing with the reverse strand.
if len(sequence) > 80 {
score, alignment, err := Align(sequence[len(sequence)-80:], barcode.Reverse)
if err != nil {
return newFastqRead, err
}
if score >= ScoreMagicNumber {
modifiedAlignment := strings.ReplaceAll(alignment, "-", "")
index := strings.Index(sequence, modifiedAlignment)
// This time we need to check within the last 80 base pairs.
if index != -1 {
newEnd := index - 7
if newEnd < len(sequence) {
sequence = sequence[:newEnd]
quality = quality[:newEnd]
}
}
}
}
newFastqRead.Sequence = sequence
newFastqRead.Quality = quality
return newFastqRead, nil
}
// Align uses the SmithWaterman algorithm to align a barcode to a sequence.
// It is used because it is a rather simple algorithm, and since barcodes are
// sufficiently small, fast enough.
func Align(sequence string, barcodeSequence string) (int, string, error) {
m := [][]int{
/* A C G T U */
/* A */ {1, -1, -1, -1, -1},
/* C */ {-1, 1, -1, -1, -1},
/* G */ {-1, -1, 1, -1, -1},
/* T */ {-1, -1, -1, 1, -1},
/* U */ {-1, -1, -1, -1, 1},
}
alphabet := alphabet.NewAlphabet([]string{"A", "C", "G", "T", "U"})
subMatrix, err := matrix.NewSubstitutionMatrix(alphabet, alphabet, m)
if err != nil {
return 0, "", err
}
scoring, err := align.NewScoring(subMatrix, -1)
if err != nil {
return 0, "", err
}
score, alignSequence, _, err := align.SmithWaterman(sequence, barcodeSequence, scoring)
if err != nil {
return 0, "", err
}
return score, alignSequence, nil
}
I don't think I need trim barcodes right now, with megamash now being a thing. So I'm deleting this block of code, but want to maintain it here for posterity.
Mostly, I just want to add docs for megamash, then this is ready.
This PR adds some functions for handling / analyzing DNA sequencing.