Koeng101 / dnadesign

A Go package for designing DNA.
Other
23 stars 0 forks source link

add sequencing functions #49

Closed Koeng101 closed 7 months ago

Koeng101 commented 8 months ago

This PR adds some functions for handling / analyzing DNA sequencing.

Koeng101 commented 8 months ago

Channels should be passing values, not pointers. This needs to be fixed.

Koeng101 commented 8 months ago

Things to do:

Koeng101 commented 7 months ago
func TrimBarcodeWithChannels(barcodeName string, fastqInput <-chan fastq.Read, fastqOutput chan<- fastq.Read) error {
    for {
        select {
        case data, ok := <-fastqInput:
            if !ok {
                // If the input channel is closed, we close the output channel and return
                close(fastqOutput)
                return nil
            }
            trimmedRead, err := TrimBarcode(barcodeName, *data)
            if err != nil {
                close(fastqOutput)
                return err
            }
            fastqOutput <- &trimmedRead
        }
    }

}

// TrimBarcode takes a barcodeName and a fastqSequence and returns a trimmed
// barcode.
func TrimBarcode(barcodeName string, fastqRead fastq.Read) (fastq.Read, error) {
    // Copy into new fastq read
    var newFastqRead fastq.Read
    newFastqRead.Identifier = fastqRead.Identifier
    newFastqRead.Optionals = make(map[string]string)
    for key, value := range fastqRead.Optionals {
        newFastqRead.Optionals[key] = value
    }

    // Get the barcode
    barcode, ok := NativeBarcodeMap[barcodeName]
    if !ok {
        return newFastqRead, fmt.Errorf("barcode %s not found in NativeBarcodeMap.", barcodeName)
    }

    // Trim in the forward direction
    var sequence string
    var quality string
    sequence = fastqRead.Sequence
    quality = fastqRead.Quality
    if len(sequence) > 80 {
        score, alignment, err := Align(sequence[:80], barcode.Forward)
        if err != nil {
            return newFastqRead, err
        }
        // Empirically from looking around, it seems to me that approximately a
        // score of 21 looks like a barcode match. This is almost by definition a
        // magic number, so it is defined above as so.
        if score >= ScoreMagicNumber {
            modifiedAlignment := strings.ReplaceAll(alignment, "-", "")
            index := strings.Index(sequence, modifiedAlignment)
            // The index needs to be within the first 80 base pairs. Usually the
            // native adapter + native barcode is within the first ~70 bp, but we
            // put a small buffer.
            if index != -1 {
                // 7 here is a magic number between the native barcode and your
                // target sequence. It's just a number that exists irl, so it is
                // not defined above.
                newStart := index + 7
                if newStart < len(sequence) {
                    sequence = sequence[newStart:]
                    quality = quality[newStart:]
                }
            }
        }
    }

    // Now do the same thing with the reverse strand.
    if len(sequence) > 80 {
        score, alignment, err := Align(sequence[len(sequence)-80:], barcode.Reverse)
        if err != nil {
            return newFastqRead, err
        }
        if score >= ScoreMagicNumber {
            modifiedAlignment := strings.ReplaceAll(alignment, "-", "")
            index := strings.Index(sequence, modifiedAlignment)
            // This time we need to check within the last 80 base pairs.
            if index != -1 {
                newEnd := index - 7
                if newEnd < len(sequence) {
                    sequence = sequence[:newEnd]
                    quality = quality[:newEnd]
                }
            }
        }
    }
    newFastqRead.Sequence = sequence
    newFastqRead.Quality = quality
    return newFastqRead, nil
}

// Align uses the SmithWaterman algorithm to align a barcode to a sequence.
// It is used because it is a rather simple algorithm, and since barcodes are
// sufficiently small, fast enough.
func Align(sequence string, barcodeSequence string) (int, string, error) {
    m := [][]int{
        /*       A C G T U */
        /* A */ {1, -1, -1, -1, -1},
        /* C */ {-1, 1, -1, -1, -1},
        /* G */ {-1, -1, 1, -1, -1},
        /* T */ {-1, -1, -1, 1, -1},
        /* U */ {-1, -1, -1, -1, 1},
    }

    alphabet := alphabet.NewAlphabet([]string{"A", "C", "G", "T", "U"})
    subMatrix, err := matrix.NewSubstitutionMatrix(alphabet, alphabet, m)
    if err != nil {
        return 0, "", err
    }
    scoring, err := align.NewScoring(subMatrix, -1)
    if err != nil {
        return 0, "", err
    }
    score, alignSequence, _, err := align.SmithWaterman(sequence, barcodeSequence, scoring)
    if err != nil {
        return 0, "", err
    }
    return score, alignSequence, nil
}

I don't think I need trim barcodes right now, with megamash now being a thing. So I'm deleting this block of code, but want to maintain it here for posterity.

Koeng101 commented 7 months ago

Mostly, I just want to add docs for megamash, then this is ready.