14e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* K=15 r=1/6 Viterbi decoder for x86 SSE2
24e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi * Copyright Mar 2004, Phil Karn, KA9Q
34e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi * May be used under the terms of the GNU Lesser General Public License (LGPL)
44e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi */
54e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <emmintrin.h>
64e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <stdio.h>
74e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <stdlib.h>
84e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <memory.h>
94e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <limits.h>
104e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include "fec.h"
114e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
124e213d510f437769f8a28578dd4f786fb7d16c4Bill Yitypedef union { unsigned long w[8]; unsigned short s[16];} decision_t;
134e213d510f437769f8a28578dd4f786fb7d16c4Bill Yitypedef union { signed short s[256]; __m128i v[32];} metric_t;
144e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
154e213d510f437769f8a28578dd4f786fb7d16c4Bill Yistatic union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3];
164e213d510f437769f8a28578dd4f786fb7d16c4Bill Yistatic int Init = 0;
174e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
184e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* State info for instance of Viterbi decoder */
194e213d510f437769f8a28578dd4f786fb7d16c4Bill Yistruct v39 {
204e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  metric_t metrics1; /* path metric buffer 1 */
214e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  metric_t metrics2; /* path metric buffer 2 */
224e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  void *dp;          /* Pointer to current decision */
234e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
244e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  void *decisions;   /* Beginning of decisions for block */
254e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi};
264e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
274e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* Initialize Viterbi decoder for start of new frame */
284e213d510f437769f8a28578dd4f786fb7d16c4Bill Yiint init_viterbi39_sse2(void *p,int starting_state){
294e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  struct v39 *vp = p;
304e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  int i;
314e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
324e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  for(i=0;i<256;i++)
334e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    vp->metrics1.s[i] = (SHRT_MIN+1000);
344e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
354e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  vp->old_metrics = &vp->metrics1;
364e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  vp->new_metrics = &vp->metrics2;
374e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  vp->dp = vp->decisions;
384e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */
394e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  return 0;
404e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi}
414e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
424e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* Create a new instance of a Viterbi decoder */
434e213d510f437769f8a28578dd4f786fb7d16c4Bill Yivoid *create_viterbi39_sse2(int len){
444e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  void *p;
454e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  struct v39 *vp;
464e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
474e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  if(!Init){
484e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    int polys[3] = { V39POLYA, V39POLYB, V39POLYC };
494e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
504e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    set_viterbi39_polynomial_sse2(polys);
514e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  }
524e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
534e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39)))
544e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    return NULL;
554e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
564e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  vp = (struct v39 *)p;
574e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  if((p = malloc((len+8)*sizeof(decision_t))) == NULL){
584e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    free(vp);
594e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    return NULL;
604e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  }
614e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  vp->decisions = (decision_t *)p;
624e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  init_viterbi39_sse2(vp,0);
634e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  return vp;
644e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi}
654e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
664e213d510f437769f8a28578dd4f786fb7d16c4Bill Yivoid set_viterbi39_polynomial_sse2(int polys[3]){
674e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  int state;
684e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
694e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  for(state=0;state < 128;state++){
704e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 255:0;
714e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 255:0;
724e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 255:0;
734e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  }
744e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  Init++;
754e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi}
764e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
774e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* Viterbi chainback */
784e213d510f437769f8a28578dd4f786fb7d16c4Bill Yiint chainback_viterbi39_sse2(
794e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      void *p,
804e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      unsigned char *data, /* Decoded output data */
814e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      unsigned int nbits, /* Number of data bits */
824e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      unsigned int endstate){ /* Terminal encoder state */
834e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  struct v39 *vp = p;
844e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  decision_t *d = (decision_t *)vp->decisions;
854e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  int path_metric;
864e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
874e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  endstate %= 256;
884e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
894e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  path_metric = vp->old_metrics->s[endstate];
904e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
914e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  /* The store into data[] only needs to be done every 8 bits.
924e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi   * But this avoids a conditional branch, and the writes will
934e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi   * combine in the cache anyway
944e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi   */
954e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  d += 8; /* Look past tail */
964e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  while(nbits-- != 0){
974e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    int k;
984e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
994e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;
1004e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    endstate = (k << 7) | (endstate >> 1);
1014e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    data[nbits>>3] = endstate;
1024e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  }
1034e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  return path_metric;
1044e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi}
1054e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1064e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* Delete instance of a Viterbi decoder */
1074e213d510f437769f8a28578dd4f786fb7d16c4Bill Yivoid delete_viterbi39_sse2(void *p){
1084e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  struct v39 *vp = p;
1094e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1104e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  if(vp != NULL){
1114e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    free(vp->decisions);
1124e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    free(vp);
1134e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  }
1144e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi}
1154e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1164e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1174e213d510f437769f8a28578dd4f786fb7d16c4Bill Yiint update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){
1184e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  struct v39 *vp = p;
1194e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  decision_t *d = (decision_t *)vp->dp;
1204e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  int path_metric = 0;
1214e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1224e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  while(nbits--){
1234e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    __m128i sym0v,sym1v,sym2v;
1244e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    void *tmp;
1254e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    int i;
1264e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1274e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
1284e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    sym0v = _mm_set1_epi16(syms[0]);
1294e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    sym1v = _mm_set1_epi16(syms[1]);
1304e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    sym2v = _mm_set1_epi16(syms[2]);
1314e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    syms += 3;
1324e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1334e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
1344e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    for(i=0;i<16;i++){
1354e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
1364e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1374e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      /* Form branch metrics
1384e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
1394e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi       * the XOR operations constitute conditional negation.
1404e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi       * metric and m_metric (-metric) are in the range 0-765
1414e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi       */
1424e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v));
1434e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0);
1444e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric);
1454e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1464e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      /* Add branch metrics to path metrics */
1474e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
1484e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric);
1494e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric);
1504e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
1514e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1524e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      /* Compare and select */
1534e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      survivor0 = _mm_min_epi16(m0,m1);
1544e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      survivor1 = _mm_min_epi16(m2,m3);
1554e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      decision0 = _mm_cmpeq_epi16(survivor0,m1);
1564e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      decision1 = _mm_cmpeq_epi16(survivor1,m3);
1574e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1584e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
1594e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
1604e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1614e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      /* Store surviving metrics */
1624e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
1634e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
1644e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    }
1654e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    /* See if we need to renormalize */
1664e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
1674e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      int i,adjust;
1684e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      __m128i adjustv;
1694e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      union { __m128i v; signed short w[8]; } t;
1704e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1714e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
1724e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      adjustv = vp->new_metrics->v[0];
1734e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      for(i=1;i<32;i++)
1744e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi	adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
1754e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1764e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
1774e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
1784e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
1794e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      t.v = adjustv;
1804e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      adjust = t.w[0] - SHRT_MIN;
1814e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      path_metric += adjust;
1824e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      adjustv = _mm_set1_epi16(adjust);
1834e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
1844e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
1854e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi       * This is okay since it can't overflow anyway
1864e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi       */
1874e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi      for(i=0;i<32;i++)
1884e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi	vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
1894e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    }
1904e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    d++;
1914e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    /* Swap pointers to old and new metrics */
1924e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    tmp = vp->old_metrics;
1934e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    vp->old_metrics = vp->new_metrics;
1944e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi    vp->new_metrics = tmp;
1954e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  }
1964e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  vp->dp = d;
1974e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi  return path_metric;
1984e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi}
1994e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
2004e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi
201