/* Viterbi filter implementation; SSE version.
* 
* This is a SIMD vectorized, striped, interleaved, one-row, reduced
 * precision (epi16) implementation of the Viterbi algorithm.
* 
* It calculates a close approximation of the Viterbi score, in
 * limited precision (signed words: 16 bits) and range. It may overflow on
* high scoring sequences, but this indicates that the sequence is a
* high-scoring hit worth examining more closely anyway.  It will not
* underflow, in local alignment mode.
* 
* Contents:
*   1. Viterbi filter implementation.
*   2. Copyright and license information
* 
* SRE, Thu Jul 31 20:32:25 2008 [Casa de Gatos]
 * SVN $Id: vitfilter.c 3018 2009-10-29 17:33:06Z farrarm $
*/

#include <hmmer3/p7_config.h>

#include <stdio.h>
#include <math.h>

#include <xmmintrin.h>		/* SSE  */
#include <emmintrin.h>		/* SSE2 */

#include <hmmer3/easel/easel.h>
#include <hmmer3/easel/esl_sse.h>

#include <hmmer3/hmmer.h>
#include <hmmer3/impl_sse/impl_sse.h>


/*****************************************************************
* 1. Viterbi filter implementation.
*****************************************************************/

/* Function:  p7_ViterbiFilter()
* Synopsis:  Calculates Viterbi score, vewy vewy fast, in limited precision.
* Incept:    SRE, Tue Nov 27 09:15:24 2007 [Janelia]
*
* Purpose:   Calculates an approximation of the Viterbi score for sequence
*            <dsq> of length <L> residues, using optimized profile <om>,
*            and a preallocated one-row DP matrix <ox>. Return the 
*            estimated Viterbi score (in nats) in <ret_sc>.
*            
*            Score may overflow (and will, on high-scoring
*            sequences), but will not underflow. 
*            
*            The model must be in a local alignment mode; other modes
*            cannot provide the necessary guarantee of no underflow.
*            
*            This is a striped SIMD Viterbi implementation using Intel
*            SSE/SSE2 integer intrinsics \citep{Farrar07}, in reduced
*            precision (signed words, 16 bits).
*
* Args:      dsq     - digital target sequence, 1..L
*            L       - length of dsq in residues          
*            om      - optimized profile
*            ox      - DP matrix
*            ret_sc  - RETURN: Viterbi score (in nats)          
*
* Returns:   <eslOK> on success;
*            <eslERANGE> if the score overflows; in this case
*            <*ret_sc> is <eslINFINITY>, and the sequence can 
*            be treated as a high-scoring hit.
*
* Throws:    <eslEINVAL> if <ox> allocation is too small, or if
*            profile isn't in a local alignment mode. (Must be in local
*            alignment mode because that's what helps us guarantee 
*            limited dynamic range.)
*
* Xref:      [Farrar07] for ideas behind striped SIMD DP.
*            J2/46-47 for layout of HMMER's striped SIMD DP.
*            J2/50 for single row DP.
*            J2/60 for reduced precision (epu8)
*            J2/65 for initial benchmarking
*            J2/66 for precision maximization
*            J4/138-140 for reimplementation in 16-bit precision
*/
int
p7_ViterbiFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc, int percentBorder, U2::TaskStateInfo & ti )
{
    register __m128i mpv, dpv, ipv;  /* previous row values                                       */
    register __m128i sv;		   /* temp storage of 1 curr row value in progress              */
    register __m128i dcv;		   /* delayed storage of D(i,q+1)                               */
    register __m128i xEv;		   /* E state: keeps max for Mk->E as we go                     */
    register __m128i xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
    register __m128i Dmaxv;          /* keeps track of maximum D cell on row                      */
    int16_t  xE, xB, xC, xJ, xN;	   /* special states' scores                                    */
    int16_t  Dmax;		   /* maximum D cell score on row                               */
    int i;			   /* counter over sequence positions 1..L                      */
    int q;			   /* counter over vectors 0..nq-1                              */
    int Q        = p7O_NQW(om->M);   /* segment length: # of vectors                              */
    __m128i *dp  = ox->dpw[0];	   /* using {MDI}MX(q) macro requires initialization of <dp>    */
    __m128i *rsc;			   /* will point at om->ru[x] for residue x[i]                  */
    __m128i *tsc;			   /* will point into (and step thru) om->tu                    */

  __m128i negInfv;

    /* Check that the DP matrix is ok for us. */
    if (Q > ox->allocQ8)                                 ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
    if (om->mode != p7_LOCAL && om->mode != p7_UNILOCAL) ESL_EXCEPTION(eslEINVAL, "Fast filter only works for local alignment");
    ox->M   = om->M;

  /* -infinity is -32768 */
  negInfv = _mm_set1_epi16(-32768);
  negInfv = _mm_srli_si128(negInfv, 14);

    /* Initialization. In unsigned arithmetic, -infinity is -32768
    */
    for (q = 0; q < Q; q++)
        MMXo(q) = IMXo(q) = DMXo(q) = _mm_set1_epi16(-32768);
    xN   = om->base_w;
    xB   = xN + om->xw[p7O_N][p7O_MOVE];
    xJ   = -32768;
    xC   = -32768;
    xE   = -32768;

#if p7_DEBUGGING
    if (ox->debugging) p7_omx_DumpVFRow(ox, 0, xE, 0, xJ, xB, xC); /* first 0 is <rowi>: do header. second 0 is xN: always 0 here. */
#endif

    // ! ADDED CODE !
	int progressStart = ti.progress;
    for (i = 1; i <= L; i++)
    {
        // ! ADDED CODE !
		ti.progress = progressStart + (int)(((double)percentBorder / L) * i);
        if( ti.cancelFlag ){ return eslCANCELED; }

        rsc   = om->rwv[dsq[i]];
        tsc   = om->twv;
        dcv   = _mm_set1_epi16(-32768);      /* "-infinity" */
        xEv   = _mm_set1_epi16(-32768);     
        Dmaxv = _mm_set1_epi16(-32768);     
        xBv   = _mm_set1_epi16(xB);

        /* Right shifts by 1 value (2 bytes). 4,8,12,x becomes x,4,8,12. 
        * Because ia32 is littlendian, this means a left bit shift.
        * Zeros shift on automatically; replace it with -32768.
        */
      mpv = MMXo(Q-1);  mpv = _mm_slli_si128(mpv, 2);  mpv = _mm_or_si128(mpv, negInfv);
      dpv = DMXo(Q-1);  dpv = _mm_slli_si128(dpv, 2);  dpv = _mm_or_si128(dpv, negInfv);
      ipv = IMXo(Q-1);  ipv = _mm_slli_si128(ipv, 2);  ipv = _mm_or_si128(ipv, negInfv);

        for (q = 0; q < Q; q++)
        {
            /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
            sv   =                    _mm_adds_epi16(xBv, *tsc);  tsc++;
            sv   = _mm_max_epi16 (sv, _mm_adds_epi16(mpv, *tsc)); tsc++;
            sv   = _mm_max_epi16 (sv, _mm_adds_epi16(ipv, *tsc)); tsc++;
            sv   = _mm_max_epi16 (sv, _mm_adds_epi16(dpv, *tsc)); tsc++;
            sv   = _mm_adds_epi16(sv, *rsc);                      rsc++;
            xEv  = _mm_max_epi16(xEv, sv);

            /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
            * {MDI}MX(q) is then the current, not the prev row
            */
            mpv = MMXo(q);
            dpv = DMXo(q);
            ipv = IMXo(q);

            /* Do the delayed stores of {MD}(i,q) now that memory is usable */
            MMXo(q) = sv;
            DMXo(q) = dcv;

            /* Calculate the next D(i,q+1) partially: M->D only;
            * delay storage, holding it in dcv
            */
            dcv   = _mm_adds_epi16(sv, *tsc);  tsc++;
            Dmaxv = _mm_max_epi16(dcv, Dmaxv);

            /* Calculate and store I(i,q) */
            sv     =                    _mm_adds_epi16(mpv, *tsc);  tsc++;
            IMXo(q)= _mm_max_epi16 (sv, _mm_adds_epi16(ipv, *tsc)); tsc++;
        }	  

        /* Now the "special" states, which start from Mk->E (->C, ->J->B) */
        xE = esl_sse_hmax_epi16(xEv);
        if (xE >= 32767) { *ret_sc = eslINFINITY; return eslERANGE; }	/* immediately detect overflow */
        xN = xN + om->xw[p7O_N][p7O_LOOP];
        xC = ESL_MAX(xC + om->xw[p7O_C][p7O_LOOP], xE + om->xw[p7O_E][p7O_MOVE]);
        xJ = ESL_MAX(xJ + om->xw[p7O_J][p7O_LOOP], xE + om->xw[p7O_E][p7O_LOOP]);
        xB = ESL_MAX(xJ + om->xw[p7O_J][p7O_MOVE], xN + om->xw[p7O_N][p7O_MOVE]);
        /* and now xB will carry over into next i, and xC carries over after i=L */

        /* Finally the "lazy F" loop (sensu [Farrar07]). We can often
        * prove that we don't need to evaluate any D->D paths at all.
        *
        * The observation is that if we can show that on the next row,
        * B->M(i+1,k) paths always dominate M->D->...->D->M(i+1,k) paths
        * for all k, then we don't need any D->D calculations.
        * 
        * The test condition is:
        *      max_k D(i,k) + max_k ( TDD(k-2) + TDM(k-1) - TBM(k) ) < xB(i)
        * So:
        *   max_k (TDD(k-2) + TDM(k-1) - TBM(k)) is precalc'ed in om->dd_bound;
        *   max_k D(i,k) is why we tracked Dmaxv;
        *   xB(i) was just calculated above.
        */
        Dmax = esl_sse_hmax_epi16(Dmaxv);
        if (Dmax + om->ddbound_w > xB) 
        {
            /* Now we're obligated to do at least one complete DD path to be sure. */
            /* dcv has carried through from end of q loop above */
            dcv = _mm_slli_si128(dcv, 2); 
	  dcv = _mm_or_si128(dcv, negInfv);
	  tsc = om->twv + 7*Q;	/* set tsc to start of the DD's */
            for (q = 0; q < Q; q++) 
            {
                DMXo(q) = _mm_max_epi16(dcv, DMXo(q));	
                dcv     = _mm_adds_epi16(DMXo(q), *tsc); tsc++;
            }

            /* We may have to do up to three more passes; the check
            * is for whether crossing a segment boundary can improve
            * our score. 
            */
            do {
                dcv = _mm_slli_si128(dcv, 2);
	    dcv = _mm_or_si128(dcv, negInfv);
	    tsc = om->twv + 7*Q;	/* set tsc to start of the DD's */
                for (q = 0; q < Q; q++) 
                {
                    if (! esl_sse_any_gt_epi16(dcv, DMXo(q))) break;
                    DMXo(q) = _mm_max_epi16(dcv, DMXo(q));	
                    dcv     = _mm_adds_epi16(DMXo(q), *tsc);   tsc++;
                }	    
            } while (q == Q);
        }
        else  /* not calculating DD? then just store the last M->D vector calc'ed.*/
	{
	  dcv = _mm_slli_si128(dcv, 2);
	  DMXo(0) = _mm_or_si128(dcv, negInfv);
	}

#if p7_DEBUGGING
        if (ox->debugging) p7_omx_DumpVFRow(ox, i, xE, 0, xJ, xB, xC);   
#endif
    } /* end loop over sequence residues 1..L */

    /* finally C->T */
    if (xC > -32768)
    {
        *ret_sc = (float) xC + (float) om->xw[p7O_C][p7O_MOVE] - (float) om->base_w;
        /* *ret_sc += L * om->ncj_roundoff;  see J4/150 for rationale: superceded by -3.0nat approximation*/
        *ret_sc /= om->scale_w;
        *ret_sc -= 3.0; /* the NN/CC/JJ=0,-3nat approximation: see J5/36. That's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ contrib */
    }
    else  *ret_sc = -eslINFINITY;
    return eslOK;
}
/*---------------- end, p7_ViterbiFilter() ----------------------*/

/************************************************************
* HMMER - Biological sequence analysis with profile HMMs
* Version 3.0; March 2010
* Copyright (C) 2010 Howard Hughes Medical Institute.
* Other copyrights also apply. See the COPYRIGHT file for a full list.
* 
* HMMER is distributed under the terms of the GNU General Public License
* (GPLv3). See the LICENSE file for details.
************************************************************/
