14e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* K=7 r=1/2 Viterbi decoder for SSE2 24e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi * Feb 2004, Phil Karn, KA9Q 34e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi */ 44e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <stdio.h> 54e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <stdlib.h> 64e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <memory.h> 74e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include <xmmintrin.h> 84e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#include "fec.h" 94e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 104e213d510f437769f8a28578dd4f786fb7d16c4Bill Yitypedef union { unsigned char c[64]; __m128i v[4]; } metric_t; 114e213d510f437769f8a28578dd4f786fb7d16c4Bill Yitypedef union { unsigned long w[2]; unsigned char c[8]; unsigned short s[4]; __m64 v[1];} decision_t; 124e213d510f437769f8a28578dd4f786fb7d16c4Bill Yiunion branchtab27 { unsigned char c[32]; __m128i v[2];} Branchtab27_sse2[2]; 134e213d510f437769f8a28578dd4f786fb7d16c4Bill Yistatic int Init = 0; 144e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 154e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* State info for instance of Viterbi decoder 164e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi * Don't change this without also changing references in sse2bfly27.s! 174e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi */ 184e213d510f437769f8a28578dd4f786fb7d16c4Bill Yistruct v27 { 194e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi metric_t metrics1; /* path metric buffer 1 */ 204e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi metric_t metrics2; /* path metric buffer 2 */ 214e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi decision_t *dp; /* Pointer to current decision */ 224e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ 234e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi decision_t *decisions; /* Beginning of decisions for block */ 244e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi}; 254e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 264e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* Initialize Viterbi decoder for start of new frame */ 274e213d510f437769f8a28578dd4f786fb7d16c4Bill Yiint init_viterbi27_sse2(void *p,int starting_state){ 284e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi struct v27 *vp = p; 294e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi int i; 304e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 314e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi if(p == NULL) 324e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi return -1; 334e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi for(i=0;i<64;i++) 344e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->metrics1.c[i] = 63; 354e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 364e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->old_metrics = &vp->metrics1; 374e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->new_metrics = &vp->metrics2; 384e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->dp = vp->decisions; 394e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ 404e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi return 0; 414e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi} 424e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 434e213d510f437769f8a28578dd4f786fb7d16c4Bill Yivoid set_viterbi27_polynomial_sse2(int polys[2]){ 444e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi int state; 454e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 464e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi for(state=0;state < 32;state++){ 474e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi Branchtab27_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; 484e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi Branchtab27_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; 494e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi } 504e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi Init++; 514e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi} 524e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 534e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 544e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* Create a new instance of a Viterbi decoder */ 554e213d510f437769f8a28578dd4f786fb7d16c4Bill Yivoid *create_viterbi27_sse2(int len){ 564e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi void *p; 574e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi struct v27 *vp; 584e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 594e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi if(!Init){ 604e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi int polys[2] = { V27POLYA, V27POLYB }; 614e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi set_viterbi27_polynomial_sse2(polys); 624e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi } 634e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ 644e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v27))) 654e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi return NULL; 664e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp = (struct v27 *)p; 674e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 684e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi if((p = malloc((len+6)*sizeof(decision_t))) == NULL){ 694e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi free(vp); 704e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi return NULL; 714e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi } 724e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->decisions = (decision_t *)p; 734e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi init_viterbi27_sse2(vp,0); 744e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 754e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi return vp; 764e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi} 774e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 784e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* Viterbi chainback */ 794e213d510f437769f8a28578dd4f786fb7d16c4Bill Yiint chainback_viterbi27_sse2( 804e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi void *p, 814e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi unsigned char *data, /* Decoded output data */ 824e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi unsigned int nbits, /* Number of data bits */ 834e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi unsigned int endstate){ /* Terminal encoder state */ 844e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi struct v27 *vp = p; 854e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi decision_t *d; 864e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 874e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi if(p == NULL) 884e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi return -1; 894e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi d = vp->decisions; 904e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Make room beyond the end of the encoder register so we can 914e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi * accumulate a full byte of decoded data 924e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi */ 934e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi endstate %= 64; 944e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi endstate <<= 2; 954e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 964e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* The store into data[] only needs to be done every 8 bits. 974e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi * But this avoids a conditional branch, and the writes will 984e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi * combine in the cache anyway 994e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi */ 1004e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi d += 6; /* Look past tail */ 1014e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi while(nbits-- != 0){ 1024e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi int k; 1034e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1044e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; 1054e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); 1064e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi } 1074e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi return 0; 1084e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi} 1094e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1104e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* Delete instance of a Viterbi decoder */ 1114e213d510f437769f8a28578dd4f786fb7d16c4Bill Yivoid delete_viterbi27_sse2(void *p){ 1124e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi struct v27 *vp = p; 1134e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1144e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi if(vp != NULL){ 1154e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi free(vp->decisions); 1164e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi free(vp); 1174e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi } 1184e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi} 1194e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1204e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1214e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#if 0 1224e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work. */ 1234e213d510f437769f8a28578dd4f786fb7d16c4Bill Yivoid update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits){ 1244e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi struct v27 *vp = p; 1254e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi decision_t *d; 1264e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1274e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi if(p == NULL) 1284e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi return; 1294e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi d = (decision_t *)vp->dp; 1304e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi while(nbits--){ 1314e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi __m128i sym0v,sym1v; 1324e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi void *tmp; 1334e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi int i; 1344e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1354e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ 1364e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi sym0v = _mm_set1_epi8(syms[0]); 1374e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi sym1v = _mm_set1_epi8(syms[1]); 1384e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi syms += 2; 1394e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1404e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi for(i=0;i<2;i++){ 1414e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; 1424e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1434e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Form branch metrics */ 1444e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i],sym0v),_mm_xor_si128(Branchtab27_sse2[1].v[i],sym1v)); 1454e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* There's no packed bytes right shift in SSE2, so we use the word version and mask 1464e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi * (I'm *really* starting to like Altivec...) 1474e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi */ 1484e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi metric = _mm_srli_epi16(metric,3); 1494e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi metric = _mm_and_si128(metric,_mm_set1_epi8(31)); 1504e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi m_metric = _mm_sub_epi8(_mm_set1_epi8(31),metric); 1514e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1524e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Add branch metrics to path metrics */ 1534e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi m0 = _mm_add_epi8(vp->old_metrics->v[i],metric); 1544e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi m3 = _mm_add_epi8(vp->old_metrics->v[2+i],metric); 1554e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi m1 = _mm_add_epi8(vp->old_metrics->v[2+i],m_metric); 1564e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi m2 = _mm_add_epi8(vp->old_metrics->v[i],m_metric); 1574e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1584e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Compare and select, using modulo arithmetic */ 1594e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0,m1),_mm_setzero_si128()); 1604e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2,m3),_mm_setzero_si128()); 1614e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi survivor0 = _mm_or_si128(_mm_and_si128(decision0,m1),_mm_andnot_si128(decision0,m0)); 1624e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi survivor1 = _mm_or_si128(_mm_and_si128(decision1,m3),_mm_andnot_si128(decision1,m2)); 1634e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1644e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Pack each set of decisions into 16 bits */ 1654e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi d->s[2*i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0,decision1)); 1664e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi d->s[2*i+1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0,decision1)); 1674e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi 1684e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Store surviving metrics */ 1694e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->new_metrics->v[2*i] = _mm_unpacklo_epi8(survivor0,survivor1); 1704e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi8(survivor0,survivor1); 1714e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi } 1724e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi d++; 1734e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi /* Swap pointers to old and new metrics */ 1744e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi tmp = vp->old_metrics; 1754e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->old_metrics = vp->new_metrics; 1764e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->new_metrics = tmp; 1774e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi } 1784e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi vp->dp = d; 1794e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi} 1804e213d510f437769f8a28578dd4f786fb7d16c4Bill Yi#endif 181