/* Intel SIMD MMX implementation of Viterbi ACS butterflies
   for 64-state (k=7) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ;

   Arguments (32-bit stack-based calling convention, read relative to %ebp):
     8(%ebp)   vp     decoder state; only the three pointer fields at the
                      offsets DP, OLDMETRICS and NEWMETRICS are touched here
     12(%ebp)  syms   received symbols, two bytes consumed per decoded bit
     16(%ebp)  nbits  number of bits (trellis steps) to process

   For every bit the routine reads two symbol bytes, runs 32 add-compare-select
   butterflies (4 macro expansions of 8 byte lanes each), writes 64 decision
   bytes through the decision pointer and 64 new path metric bytes, and then
   swaps the roles of the old and new metric buffers.

   Returns 0 once nbits steps have been processed (immediately if nbits <= 0),
   or -1 if vp is NULL.  On the success path the advanced decision pointer and
   the (possibly swapped) metric pointers are written back into *vp.

   Mettab27_1 and Mettab27_2 are external tables; this file only relies on
   each symbol value selecting a 32-byte row in them.
*/
	# MMX (64-bit SIMD) version
	# requires Pentium-MMX, Pentium-II or better

	# These are offsets into struct v27, defined in viterbi27_mmx.c
	.set DP,128
	.set OLDMETRICS,132
	.set NEWMETRICS,136
	.text
	.global update_viterbi27_blk_mmx,Mettab27_1,Mettab27_2
	.type update_viterbi27_blk_mmx,@function
	.align 16

update_viterbi27_blk_mmx:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	# NULL vp: return -1.  The '$' is required; without it the operand is
	# a memory reference and the instruction would load from address -1.
	movl $-1,%eax
	jmp  err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

	# Main loop, one iteration per bit.  The nbits and syms argument slots
	# on the stack are used as the loop counter and the running pointer.
1:	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   2f			# went below zero, all bits processed
	movl %eax,16(%ebp)

	movl 12(%ebp),%ebx	# ebx = syms
	movw (%ebx),%ax		# ax = second symbol : first symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)	# syms += 2

	movb %ah,%bl		# bl = second symbol
	andl $255,%eax		# eax = first symbol, zero extended
	andl $255,%ebx		# ebx = second symbol, zero extended

	# shift into first array index dimension slot
	# (multiply by 32: each symbol value selects a 32-byte table row)
	shll $5,%eax
	shll $5,%ebx

	# each invocation of this macro will do 8 butterflies in parallel
	#
	# Inputs:  eax, ebx = row offsets into Mettab27_1 / Mettab27_2
	#          esi = old metrics, edi = new metrics, edx = decisions
	# GROUP selects which 8-byte slice (0..3) of the table rows and of each
	# half of the old metric array is processed.  All of mm0-mm7 are used.
	.MACRO butterfly GROUP
	# Compute branch metrics
	movq (Mettab27_1+8*\GROUP)(%eax),%mm3
	movq fifteens,%mm0

	paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3
	paddb ones,%mm3		# emulate pavgb - this may not be necessary
	psrlq $1,%mm3		# halve; the shift spans the whole quadword
	pand %mm0,%mm3		# keep low 4 bits per byte, dropping the bit
				# shifted in from the neighbouring byte

	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
	movq ((8*\GROUP)+32)(%esi),%mm2	# Incoming path metric, high bit = 1
	movq %mm6,%mm1
	movq %mm2,%mm7

	paddb %mm3,%mm6		# mm6 = metric(high bit 0) + branch metric
	paddb %mm3,%mm2		# mm2 = metric(high bit 1) + branch metric
	pxor %mm0,%mm3		# invert branch metric (15 - metric)
	paddb %mm3,%mm7		# path metric for inverted symbols
	paddb %mm3,%mm1		# mm1 = metric(high bit 0) + inverted metric

	# live registers 1 2 6 7
	# Compare mm6 and mm7; mm1 and mm2
	# The comparison is done on the signed byte difference against zero.
	pxor %mm3,%mm3		# mm3 = 0
	movq %mm6,%mm4
	movq %mm1,%mm5
	psubb %mm7,%mm4		# mm4 = mm6 - mm7
	psubb %mm2,%mm5		# mm5 = mm1 - mm2
	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions

	# live registers 1 2 4 5 6 7
	# select survivors: per byte, take mm7 (resp. mm2) where the decision
	# mask is ff, otherwise mm6 (resp. mm1)
	movq %mm4,%mm0
	pand %mm4,%mm7
	movq %mm5,%mm3
	pand %mm5,%mm2
	pandn %mm6,%mm0		# mm0 = ~mask & mm6
	pandn %mm1,%mm3		# mm3 = ~mask & mm1
	por %mm0,%mm7		# mm7 = first set of survivors
	por %mm3,%mm2		# mm2 = second set of survivors

	# live registers 2 4 5 7
	# interleave & store decisions in mm4, mm5
	# interleave & store new path metrics in mm2, mm7
	# Bytes from the first set land in even positions, bytes from the
	# second set in odd positions.  Low halves are stored at offset
	# 16*GROUP, high halves at 16*GROUP+8.
	movq %mm4,%mm3
	movq %mm7,%mm0
	punpckhbw %mm5,%mm4	# high halves of the two decision sets
	punpcklbw %mm5,%mm3	# low halves of the two decision sets
	punpcklbw %mm2,%mm7	# low halves of the two survivor sets
	punpckhbw %mm2,%mm0	# high halves of the two survivor sets
	movq %mm4,(16*\GROUP+8)(%edx)
	movq %mm3,(16*\GROUP)(%edx)
	movq %mm7,(16*\GROUP)(%edi)
	movq %mm0,(16*\GROUP+8)(%edi)

	.endm

# invoke macro 4 times for a total of 32 butterflies
	butterfly GROUP=0
	butterfly GROUP=1
	butterfly GROUP=2
	butterfly GROUP=3

	addl $64,%edx		# bump decision pointer

	# swap metrics
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	emms			# leave MMX state before returning to the caller
	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0
	# Common exit.  The NULL-vp path arrives here with eax = -1 and has not
	# executed any MMX instruction, so it does not need emms.
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret

	.data
	.align 8
	# per-byte mask (and XOR constant) for the 4-bit branch metrics
fifteens:
	.byte 15,15,15,15,15,15,15,15

	.align 8
	# per-byte rounding constant added before the halving shift
ones:	.byte 1,1,1,1,1,1,1,1