attractivechaos / klib

A standalone and lightweight C library
http://attractivechaos.github.io/klib/
MIT License
4.18k stars 556 forks source link

Added a ksw_global_end function. #92

Open jkbonfield opened 6 years ago

jkbonfield commented 6 years ago

This is as per ksw_global, but permits control over whether gaps at the start and/or end of query and target sequences are penalised.

For experimentation I also added another test main() function, although I'm happy for this to be dropped of course. To demonstrate it I compare GTC vs GATTTTTC and GTTTTTAC, in either order. Initially I penalise end gaps ($e == 1 in my command line):

@ seq3c[work/klib]; e=1;./a.out GTC GATTTTTC $e $e $e $e 3 1 -2 5 0;./a.out  GATTTTTC GTC $e $e $e $e 3 1 -2 5 0;./a.out GTC GTTTTTAC $e $e $e $e 3 1 -2 5 0;./a.out GTTTTTAC GTC $e $e $e $e 3 1 -2 5 6;
Score=7: 1M 5D 2M 
G G
- A
- T
- T
- T
- T
T T
C C
Score=7: 1M 5I 2M 
G G
A -
T -
T -
T -
T -
T T
C C
Score=7: 2M 5D 1M 
G G
T T
- T
- T
- T
- T
- A
C C
Score=7: 2M 5I 1M 
G G
T T
T -
T -
T -
T -
A -
C C

In every case we see a large gap and align the few bases either side. With $e == 0 so that end gaps aren't penalised, the score is higher and we see a mismatch preferred instead:

@ seq3c[work/klib]; e=0;./a.out GTC GATTTTTC $e $e $e $e 3 1 -2 5 0;./a.out  GATTTTTC GTC $e $e $e $e 3 1 -2 5 0;./a.out GTC GTTTTTAC $e $e $e $e 3 1 -2 5 0;./a.out GTTTTTAC GTC $e $e $e $e 3 1 -2 5 6;
Score=8: 5D 3M 
- G
- A
- T
- T
- T
G T
T T
C C
Score=8: 5I 3M 
G -
A -
T -
T -
T -
T G
T T
C C
Score=8: 3M 5D 
G G
T T
C T
- T
- T
- T
- A
- C
Score=8: 3M 5I 
G G
T T
T C
T -
T -
T -
A -
C -

Note I cannot explain the alignments I get from the existing ksw_global function. Is my test harness processing this correctly? If I call ksw_global instead of ksw_global_end in my main function (see the commented out line) then I get an invalid score and truncated alignments in some of these combinations. I don't know why this is so.

A possible improvement is to express ksw_global in terms of ksw_global_end (it's just 4 extra arguments all set to 1), but I left it as-is for you to decide whether the slight performance difference is worth it.

James

jkbonfield commented 6 years ago

I appreciate this isn't easy to study, so if you wish to see what the differences are between the functions, then here is what it looks like with ksw_global replaced by ksw_global_end:

-int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
+/**
+ * Global alignment with the ability to control which start/end gaps
+ * are penalised for.  Otherwise identical to ksw_global.
+ *
+ * @param s_q   Boolean: whether to count gaps at start of query
+ * @param e_q   Boolean: whether to count gaps at end   of query
+ * @param s_t   Boolean: whether to count gaps at start of target
+ * @param e_t   Boolean: whether to count gaps at end   of target
+ */
+int ksw_global_end(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat,
+                  int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_, int s_q, int e_q, int s_t, int e_t)
 {
        eh_t *eh;
        int8_t *qp; // query profile
@@ -462,6 +472,7 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
        uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
        if (n_cigar_) *n_cigar_ = 0;
        // allocate memory
+       if (w == 0) w = qlen > tlen ? qlen : tlen;
        n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
        z = malloc(n_col * tlen);
        qp = malloc(qlen * m);
@@ -474,16 +485,17 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
        // fill the first row
        eh[0].h = 0; eh[0].e = MINUS_INF;
        for (j = 1; j <= qlen && j <= w; ++j)
-               eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
+               eh[j].h = s_t ? -(gapo + gape * j) : 0, eh[j].e = MINUS_INF;
        for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
        // DP loop
+       int32_t beg, best_score = MINUS_INF, best_i = 0;
        for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
-               int32_t f = MINUS_INF, h1, beg, end;
+               int32_t f = MINUS_INF, h1, end;
                uint8_t *zi = &z[i * n_col];
                beg = i > w? i - w : 0;
                end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
-               h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
+               h1 = beg == 0? (s_q ? -(gapo + gape * (i + 1)) : 0) : MINUS_INF;
                for (j = beg; LIKELY(j < end); ++j) {
                        // This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
                        // 1) not checking h>0; 2) recording direction for backtracking
@@ -508,12 +520,34 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
                        zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
                }
                eh[end].h = h1; eh[end].e = MINUS_INF;
+               if (best_score < eh[end].h)
+                       best_score = eh[end].h, best_i = i;
        }
        score = eh[qlen].h;
+       i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
+       int n_cigar = 0, m_cigar = 0;
+       uint32_t *cigar = NULL;
+       if (!e_t) {
+               if (score < best_score) {
+                       score = best_score;
+                       if (best_i < i && n_cigar_ && cigar_)
+                               cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i - best_i);
+                       i = best_i;
+               }
+       }
+       if (!e_q) {
+               for (j = beg; j <= qlen; j++) {
+                       if (score < eh[j].h) {
+                               score = eh[j].h;
+                               if (j < k && n_cigar_ && cigar_)
+                                       cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k-j);
+                               k = j;
+                       }
+               }
+       }
        if (n_cigar_ && cigar_) { // backtrack
-               int n_cigar = 0, m_cigar = 0, which = 0;
-               uint32_t *cigar = 0, tmp;
-               i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
+               int which = 0;
+               uint32_t tmp;
                while (i >= 0 && k >= 0) {
                        which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
                        if (which == 0)      cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;

I confess looking at it again this way I'm unsure of the band calculation. It may be should only set best_score when within 'w' of tlen, but it's hard to understand.