/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
   for 64-state (k=7) convolutional code
   Copyright 2001 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ;

   Arguments are read from the stack relative to %ebp:
      8(%ebp)  vp     decoder state; a null pointer is rejected
     12(%ebp)  syms   received symbols, two bytes are consumed per bit
     16(%ebp)  nbits  number of bits to process; if it is zero or
                      negative no bits are processed

   Returns 0 in %eax on success and -1 in %eax when vp is null.

   For every bit the routine reads 64 one-byte path metrics through the
   OLDMETRICS pointer, writes 64 updated metrics through the NEWMETRICS
   pointer, stores 64 decision bits (8 bytes) at the DP pointer, and then
   exchanges the two metric pointers.  On exit the metric pointers and the
   advanced decision pointer are written back into *vp.

   The argument slots for syms and nbits on the stack are used as the
   loop state and are modified in place.
*/

        # SSE (64-bit integer SIMD) version
        # Requires Pentium III or better

        # These are offsets into struct v27, defined in viterbi27.h
        .set DP,128
        .set OLDMETRICS,132
        .set NEWMETRICS,136

        # butterfly: each invocation does 8 butterflies in parallel.
        # Inputs:   %esi -> old metrics, %edi -> new metrics,
        #           %mm6 = first symbol in every byte,
        #           %mm5 = second symbol in every byte,
        #           %mm7 = 31 in every byte
        # Output:   16 new metrics stored through %edi,
        #           16 decision bits ORed into %ebx
        # Clobbers: %eax, %mm0-%mm4
        .macro butterfly GROUP
        # compute branch metrics
        movq Branchtab27_sse+(8*\GROUP),%mm4
        movq Branchtab27_sse+32+(8*\GROUP),%mm3
        pxor %mm6,%mm4
        pxor %mm5,%mm3
        pavgb %mm3,%mm4         # mm4 contains branch metrics
        psrlw $3,%mm4
        pand %mm7,%mm4          # keep 5 bits per byte

        movq (8*\GROUP)(%esi),%mm0      # Incoming path metric, high bit = 0
        movq ((8*\GROUP)+32)(%esi),%mm3 # Incoming path metric, high bit = 1
        movq %mm0,%mm2
        movq %mm3,%mm1
        paddusb %mm4,%mm0
        paddusb %mm4,%mm3

        # invert branch metrics. This works only because they are 5 bits
        pxor %mm7,%mm4

        paddusb %mm4,%mm1
        paddusb %mm4,%mm2

        # Find survivors, leave in mm0,2
        pminub %mm1,%mm0
        pminub %mm3,%mm2
        # get decisions, leave in mm1,3
        # (a byte is all ones when the candidate coming from the
        #  high bit = 1 state equals the survivor, ties included)
        pcmpeqb %mm0,%mm1
        pcmpeqb %mm2,%mm3

        # interleave and store new path metrics from mm0,2
        movq %mm0,%mm4
        punpckhbw %mm2,%mm0     # interleave second 8 new metrics
        punpcklbw %mm2,%mm4     # interleave first 8 new metrics
        movq %mm0,(16*\GROUP+8)(%edi)
        movq %mm4,(16*\GROUP)(%edi)

        # interleave decisions, accumulate into %ebx
        movq %mm1,%mm4
        punpckhbw %mm3,%mm1
        punpcklbw %mm3,%mm4
        # Due to an error in the Intel instruction set ref (the register
        # fields are swapped), gas assembles pmovmskb incorrectly
        # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
        .byte 0x0f,0xd7,0xc1    # pmovmskb %mm1,%eax
        shll $((16*\GROUP+8)&31),%eax
        orl %eax,%ebx
        .byte 0x0f,0xd7,0xc4    # pmovmskb %mm4,%eax
        shll $((16*\GROUP)&31),%eax
        orl %eax,%ebx
        .endm

        # PSUBUSBM: saturating unsigned byte subtract of REG from the
        # 8 bytes at MEM, result written back to MEM.
        # Trashes %mm7 (it is reloaded at the top of the bit loop)
        .macro PSUBUSBM REG,MEM
        movq \MEM,%mm7
        psubusb \REG,%mm7
        movq %mm7,\MEM
        .endm

.text
.global update_viterbi27_blk_sse,Branchtab27_sse
        .type update_viterbi27_blk_sse,@function
        .align 16

update_viterbi27_blk_sse:
        pushl %ebp
        movl %esp,%ebp
        pushl %esi
        pushl %edi
        pushl %edx
        pushl %ebx

        movl 8(%ebp),%edx       # edx = vp
        testl %edx,%edx
        jnz  0f
        # The immediate needs the $ prefix: without it the operand is an
        # absolute memory reference and the error path would fault
        movl $-1,%eax
        jmp  err
0:      movl OLDMETRICS(%edx),%esi      # esi -> old metrics
        movl NEWMETRICS(%edx),%edi      # edi -> new metrics
        movl DP(%edx),%edx      # edx -> decisions

1:      movl 16(%ebp),%eax      # eax = nbits
        decl %eax
        jl   2f                 # passed zero, we are done
        movl %eax,16(%ebp)

        xorl %eax,%eax
        movl 12(%ebp),%ebx      # %ebx = syms
        movb (%ebx),%al
        movd %eax,%mm6          # mm6[0] = first symbol
        movb 1(%ebx),%al
        movd %eax,%mm5          # mm5[0] = second symbol
        addl $2,%ebx
        movl %ebx,12(%ebp)

        punpcklbw %mm6,%mm6     # mm6[1] = mm6[0]
        punpcklbw %mm5,%mm5
        movq thirtyones,%mm7

        pshufw $0,%mm6,%mm6     # copy low word to upper 3
        pshufw $0,%mm5,%mm5
        # mm6 now contains first symbol in each byte, mm5 the second

        # invoke macro 4 times for a total of 32 butterflies
        xorl %ebx,%ebx          # clear decisions
        butterfly GROUP=0
        butterfly GROUP=1
        movl %ebx,(%edx)        # stash first 32 decisions
        xorl %ebx,%ebx
        butterfly GROUP=2
        butterfly GROUP=3
        movl %ebx,4(%edx)       # stash second 32 decisions

        addl $8,%edx            # bump decision pointer

        # see if we have to normalize
        movl (%edi),%eax        # extract first output metric
        andl $255,%eax
        cmpl $150,%eax          # is it greater than 150?
        jle  done               # No, no need to normalize

        # Normalize by finding smallest metric and subtracting it
        # from all metrics
        movq (%edi),%mm0
        pminub 8(%edi),%mm0
        pminub 16(%edi),%mm0
        pminub 24(%edi),%mm0
        pminub 32(%edi),%mm0
        pminub 40(%edi),%mm0
        pminub 48(%edi),%mm0
        pminub 56(%edi),%mm0
        # mm0 contains 8 smallest metrics
        # crunch down to single lowest metric
        movq %mm0,%mm1
        psrlq $32,%mm0
        pminub %mm1,%mm0
        movq %mm0,%mm1
        psrlq $16,%mm0
        pminub %mm1,%mm0
        movq %mm0,%mm1
        psrlq $8,%mm0
        pminub %mm1,%mm0
        punpcklbw %mm0,%mm0     # expand to all 8 bytes
        pshufw $0,%mm0,%mm0

        # mm0 now contains lowest metric in all 8 bytes
        # subtract it from every output metric
        PSUBUSBM %mm0,(%edi)
        PSUBUSBM %mm0,8(%edi)
        PSUBUSBM %mm0,16(%edi)
        PSUBUSBM %mm0,24(%edi)
        PSUBUSBM %mm0,32(%edi)
        PSUBUSBM %mm0,40(%edi)
        PSUBUSBM %mm0,48(%edi)
        PSUBUSBM %mm0,56(%edi)

done:   # swap metrics
        movl %esi,%eax
        movl %edi,%esi
        movl %eax,%edi
        jmp  1b

2:      emms
        movl 8(%ebp),%ebx       # ebx = vp
        # stash metric pointers
        movl %esi,OLDMETRICS(%ebx)
        movl %edi,NEWMETRICS(%ebx)
        movl %edx,DP(%ebx)      # stash incremented value of vp->dp
        xorl %eax,%eax          # return 0
err:    popl %ebx
        popl %edx
        popl %edi
        popl %esi
        popl %ebp

        ret

        .data

        .align 16
thirtyones:
        .byte 31,31,31,31,31,31,31,31