Open jkbonfield opened 6 years ago
I appreciate this isn't easy to study, so if you wish to see what the differences are between the functions, then here is what it looks like with ksw_global replaced by ksw_global_end:
-int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
+/**
+ * Global alignment with the ability to control which start/end gaps
+ * are penalised for. Otherwise identical to ksw_global.
+ *
+ * @param s_q Boolean: whether to count gaps at start of query
+ * @param e_q Boolean: whether to count gaps at end of query
+ * @param s_t Boolean: whether to count gaps at start of target
+ * @param e_t Boolean: whether to count gaps at end of target
+ */
+int ksw_global_end(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat,
+ int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_, int s_q, int e_q, int s_t, int e_t)
{
eh_t *eh;
int8_t *qp; // query profile
@@ -462,6 +472,7 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
if (n_cigar_) *n_cigar_ = 0;
// allocate memory
+ if (w == 0) w = qlen > tlen ? qlen : tlen;
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
z = malloc(n_col * tlen);
qp = malloc(qlen * m);
@@ -474,16 +485,17 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
// fill the first row
eh[0].h = 0; eh[0].e = MINUS_INF;
for (j = 1; j <= qlen && j <= w; ++j)
- eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
+ eh[j].h = s_t ? -(gapo + gape * j) : 0, eh[j].e = MINUS_INF;
for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
// DP loop
+ int32_t beg, best_score = MINUS_INF, best_i = 0;
for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
- int32_t f = MINUS_INF, h1, beg, end;
+ int32_t f = MINUS_INF, h1, end;
uint8_t *zi = &z[i * n_col];
beg = i > w? i - w : 0;
end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
- h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
+ h1 = beg == 0? (s_q ? -(gapo + gape * (i + 1)) : 0) : MINUS_INF;
for (j = beg; LIKELY(j < end); ++j) {
// This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
// 1) not checking h>0; 2) recording direction for backtracking
@@ -508,12 +520,34 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
}
eh[end].h = h1; eh[end].e = MINUS_INF;
+ if (best_score < eh[end].h)
+ best_score = eh[end].h, best_i = i;
}
score = eh[qlen].h;
+ i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
+ int n_cigar = 0, m_cigar = 0;
+ uint32_t *cigar = NULL;
+ if (!e_t) {
+ if (score < best_score) {
+ score = best_score;
+ if (best_i < i && n_cigar_ && cigar_)
+ cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i - best_i);
+ i = best_i;
+ }
+ }
+ if (!e_q) {
+ for (j = beg; j <= qlen; j++) {
+ if (score < eh[j].h) {
+ score = eh[j].h;
+ if (j < k && n_cigar_ && cigar_)
+ cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k-j);
+ k = j;
+ }
+ }
+ }
if (n_cigar_ && cigar_) { // backtrack
- int n_cigar = 0, m_cigar = 0, which = 0;
- uint32_t *cigar = 0, tmp;
- i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
+ int which = 0;
+ uint32_t tmp;
while (i >= 0 && k >= 0) {
which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
I confess looking at it again this way I'm unsure of the band calculation. It may be should only set best_score when within 'w' of tlen, but it's hard to understand.
This is as per ksw_global, but permits control over whether gaps at the start and/or end of query and target sequences are penalised.
For experimentation I also added another test main() function, although I'm happy for this to be dropped of course. To demonstrate it I compare GTC vs GATTTTTC and GTTTTTAC, in either order. Initially I penalise end gaps ($e == 1 in my command line):
In every case we see a large gap and align the few bases either side. With $e == 0 so that end gaps aren't penalised, the score is higher and we see a mismatch preferred instead:
Note I cannot explain the alignments I get from the existing ksw_global function. Is my test harness processing this correctly? If I call ksw_global instead of ksw_global_end in my main function (see the commented out line) then I get an invalid score and truncated alignments in some of these combinations. I don't know why this is so.
A possible improvement is to express ksw_global in terms of ksw_global_end (it's just 4 extra arguments all set to 1), but I left it as-is for you to decide whether the slight performance difference is worth it.
James