/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
   for 64-state (k=7) convolutional code
   Copyright 2003 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ;

   Calling convention (32-bit cdecl, arguments on the stack):
       8(%ebp)  = vp     pointer to the decoder state
      12(%ebp)  = syms   two received symbols are consumed per decoded bit
      16(%ebp)  = nbits  number of bits (trellis steps) to process
   The stack copies of syms and nbits are used as the loop variables and
   are modified in place.
   On exit %eax is 0 on success, or -1 if vp is NULL (nothing is touched
   in that case).
*/
	# SSE2 (128-bit integer SIMD) version
	# Requires Pentium 4 or better

	# These are offsets into struct v27, defined in viterbi27.h
	.set DP,128
	.set OLDMETRICS,132
	.set NEWMETRICS,136
	.text
	.global update_viterbi27_blk_sse2,Branchtab27_sse2
	.type update_viterbi27_blk_sse2,@function
	.align 16

update_viterbi27_blk_sse2:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	# Immediate operand needs the '$' prefix; a bare "-1" would be a
	# memory load from absolute address 0xFFFFFFFF and fault.
	movl $-1,%eax
	jmp  err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

1:	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   2f			# passed zero, we're done
	movl %eax,16(%ebp)

	xorl %eax,%eax
	movl 12(%ebp),%ebx	# ebx = syms
	movb (%ebx),%al
	movd %eax,%xmm6		# xmm6[0] = first symbol
	movb 1(%ebx),%al
	movd %eax,%xmm5		# xmm5[0] = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)

	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
	punpcklbw %xmm5,%xmm5
	pshuflw $0,%xmm6,%xmm6	# replicate low word into the low four words
	pshuflw $0,%xmm5,%xmm5
	punpcklqdq %xmm6,%xmm6	# propagate to all 16
	punpcklqdq %xmm5,%xmm5
	# xmm6 now contains first symbol in each byte, xmm5 the second

	movdqa thirtyones,%xmm7

	# each invocation of this macro does 16 butterflies in parallel
	.MACRO butterfly GROUP
	# compute branch metrics
	movdqa Branchtab27_sse2+(16*\GROUP),%xmm4
	movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3
	pxor %xmm6,%xmm4
	pxor %xmm5,%xmm3

	# compute 5-bit branch metric in xmm4 by adding the individual symbol metrics
	# (byte-wise average, shifted right by 3 and masked to 5 bits).
	# This is okay for this
	# code because the worst-case metric spread (at high Eb/No) is only 120,
	# well within the range of our unsigned 8-bit path metrics, and even within
	# the range of signed 8-bit path metrics
	pavgb %xmm3,%xmm4
	psrlw $3,%xmm4

	pand %xmm7,%xmm4	# psrlw shifts whole words; mask off bits leaked from the neighbouring byte

	movdqa (16*\GROUP)(%esi),%xmm0	# Incoming path metric, high bit = 0
	movdqa ((16*\GROUP)+32)(%esi),%xmm3	# Incoming path metric, high bit = 1
	movdqa %xmm0,%xmm2
	movdqa %xmm3,%xmm1
	paddusb %xmm4,%xmm0	# note use of saturating arithmetic
	paddusb %xmm4,%xmm3	# this shouldn't be necessary, but why not?

	# negate branch metrics (xor with 31 gives 31 - metric)
	pxor %xmm7,%xmm4
	paddusb %xmm4,%xmm1
	paddusb %xmm4,%xmm2

	# Find survivors, leave in xmm0,2
	pminub %xmm1,%xmm0
	pminub %xmm3,%xmm2
	# get decisions, leave in xmm1,3
	pcmpeqb %xmm0,%xmm1
	pcmpeqb %xmm2,%xmm3

	# interleave the new path metrics from xmm0,2 and store them
	movdqa %xmm0,%xmm4
	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
	movdqa %xmm0,(32*\GROUP+16)(%edi)
	movdqa %xmm4,(32*\GROUP)(%edi)

	# interleave decisions & store
	movdqa %xmm1,%xmm4
	punpckhbw %xmm3,%xmm1
	punpcklbw %xmm3,%xmm4
	# work around bug in gas due to Intel doc error
	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
	shll $16,%ebx
	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
	orl %eax,%ebx
	movl %ebx,(4*\GROUP)(%edx)
	.endm

	# invoke macro 2 times for a total of 32 butterflies
	butterfly GROUP=0
	butterfly GROUP=1

	addl $8,%edx		# bump decision pointer

	# See if we have to normalize. This requires an explanation. We don't want
	# our path metrics to exceed 255 on the *next* iteration. Since the
	# largest branch metric is 30, that means we don't want any to exceed 225
	# on *this* iteration. Rather than look at them all, we just pick an arbitrary one
	# (the first) and see if it exceeds 225-120=105, where 120 is the experimentally-
	# determined worst-case metric spread for this code and branch metrics in the range 0-30.

	# This is extremely conservative, and empirical testing at a variety of Eb/Nos might
	# show that a higher threshold could be used without affecting BER performance
	movl (%edi),%eax	# extract first output metric
	andl $255,%eax
	cmp $105,%eax
	jle done		# No, no need to normalize

	# Normalize by finding smallest metric and subtracting it
	# from all metrics. We can't just pick an arbitrary small constant because
	# the minimum metric might be zero!
	movdqa (%edi),%xmm0
	movdqa %xmm0,%xmm4
	movdqa 16(%edi),%xmm1
	pminub %xmm1,%xmm4
	movdqa 32(%edi),%xmm2
	pminub %xmm2,%xmm4
	movdqa 48(%edi),%xmm3
	pminub %xmm3,%xmm4

	# crunch down to single lowest metric
	movdqa %xmm4,%xmm5
	psrldq $8,%xmm5		# the count to psrldq is bytes, not bits!
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $32,%xmm5
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $16,%xmm5
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $8,%xmm5
	pminub %xmm5,%xmm4	# now in lowest byte of %xmm4

	punpcklbw %xmm4,%xmm4	# lowest 2 bytes
	pshuflw $0,%xmm4,%xmm4	# lowest 8 bytes
	punpcklqdq %xmm4,%xmm4	# all 16 bytes

	# xmm4 now contains lowest metric in all 16 bytes
	# subtract it from every output metric
	psubusb %xmm4,%xmm0
	psubusb %xmm4,%xmm1
	psubusb %xmm4,%xmm2
	psubusb %xmm4,%xmm3
	movdqa %xmm0,(%edi)
	movdqa %xmm1,16(%edi)
	movdqa %xmm2,32(%edi)
	movdqa %xmm3,48(%edi)

done:
	# swap metrics
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0 on success
err:	popl %ebx		# common exit; eax already holds the return value
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret

	.data
	.align 16

	# 16 copies of 31: mask for the 5-bit branch metrics, also used to negate them
thirtyones:
	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31