/* Intel SIMD MMX implementation of Viterbi ACS (add-compare-select) butterflies
   for 64-state (k=7) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   C prototype:
   int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ;
*/
	# MMX (64-bit SIMD) version
	# requires Pentium-MMX, Pentium-II or better
/* Byte offsets of the fields of struct v27 (defined in viterbi27_mmx.c) that
   this file accesses.  They are hard-coded here, so they must be kept in sync
   with the C structure definition by hand. */
	.set DP,128		# vp->dp: pointer to where the next decisions are stored
	.set OLDMETRICS,132	# pointer to the old (input) path metrics
	.set NEWMETRICS,136	# pointer to the new (output) path metrics

	.text
	# Mettab27_1 and Mettab27_2 are only referenced in this file, not defined here
	.global update_viterbi27_blk_mmx,Mettab27_1,Mettab27_2
	.type update_viterbi27_blk_mmx,@function
	.align 16
/*
 * butterfly GROUP   (GROUP = 0..3)
 *
 * Each invocation does 8 butterflies in parallel, one per byte lane.
 *
 * In:   eax = first symbol  * 32  (byte offset of its row in Mettab27_1)
 *       ebx = second symbol * 32  (byte offset of its row in Mettab27_2)
 *       esi -> old path metrics; reads 8 bytes at 8*GROUP and at 8*GROUP+32
 *       edi -> new path metrics; writes 16 bytes at 16*GROUP
 *       edx -> decisions;        writes 16 bytes at 16*GROUP
 * Uses: fifteens, ones (8-byte constants defined in this file)
 * Clobbers: mm0-mm7.  No general-purpose register is modified.
 */
	.macro butterfly GROUP
	# Compute branch metrics: per byte, (tab1 + tab2 + 1) >> 1, kept to 4 bits.
	# psrlq shifts the whole quadword, so the low bit of each byte leaks into
	# bit 7 of the byte below it; the pand with fifteens discards those bits.
	movq (Mettab27_1+8*\GROUP)(%eax),%mm3
	movq fifteens,%mm0

	paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3
	paddb ones,%mm3  # emulate pavgb - this may not be necessary
	psrlq $1,%mm3
	pand %mm0,%mm3

	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
	movq ((8*\GROUP)+32)(%esi),%mm2 # Incoming path metric, high bit = 1
	movq %mm6,%mm1
	movq %mm2,%mm7

	paddb %mm3,%mm6
	paddb %mm3,%mm2
	pxor  %mm0,%mm3		 # invert branch metric (xor with 15 on a 4-bit value)
	paddb %mm3,%mm7		 # path metric for inverted symbols
	paddb %mm3,%mm1

	# live registers 1 2 6 7
	# Compare mm6 and mm7;  mm1 and mm2
	# The byte-wise difference is compared against zero as a signed value.
	pxor %mm3,%mm3
	movq %mm6,%mm4
	movq %mm1,%mm5
	psubb %mm7,%mm4		# mm4 = mm6 - mm7
	psubb %mm2,%mm5		# mm5 = mm1 - mm2
	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = high-bit-1 path better)
	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions

	# live registers 1 2 4 5 6 7
	# select survivors: take mm7 (resp. mm2) where the decision byte is ff,
	# otherwise mm6 (resp. mm1)
	movq %mm4,%mm0
	pand %mm4,%mm7
	movq %mm5,%mm3
	pand %mm5,%mm2
	pandn %mm6,%mm0
	pandn %mm1,%mm3
	por %mm0,%mm7		# mm7 = first set of survivors
	por %mm3,%mm2		# mm2 = second set of survivors

	# live registers 2 4 5 7
	# interleave & store decisions in mm4, mm5
	# interleave & store new path metrics in mm2, mm7
	movq %mm4,%mm3
	movq %mm7,%mm0
	punpckhbw %mm5,%mm4	# decisions, high 4 bytes of each set interleaved
	punpcklbw %mm5,%mm3	# decisions, low 4 bytes of each set interleaved
	punpcklbw %mm2,%mm7	# new metrics, low 4 bytes of each set interleaved
	punpckhbw %mm2,%mm0	# new metrics, high 4 bytes of each set interleaved
	movq %mm4,(16*\GROUP+8)(%edx)
	movq %mm3,(16*\GROUP)(%edx)
	movq %mm7,(16*\GROUP)(%edi)
	movq %mm0,(16*\GROUP+8)(%edi)

	.endm

/*
 * int update_viterbi27_blk_mmx(struct v27 *vp, unsigned char *syms, int nbits)
 *
 * Runs nbits add-compare-select steps.  Each step reads two symbol bytes from
 * syms, runs all 32 butterflies, stores 64 decision bytes (each 00 or ff) at
 * the decision pointer, advances that pointer by 64 and swaps the old/new
 * metric buffers.  A non-positive nbits performs no steps.
 *
 * ABI:  i386, arguments on the stack (frame pointer in ebp)
 *          8(%ebp) = vp
 *         12(%ebp) = syms   (advanced in place, 2 bytes per step)
 *         16(%ebp) = nbits  (counted down in place)
 * Out:  eax = 0 on success, with the DP, OLDMETRICS and NEWMETRICS fields of
 *             *vp updated to their final values;
 *       eax = -1 if vp is NULL (nothing is read or written through vp).
 * Saved and restored: ebp, esi, edi, edx, ebx.
 * Clobbers: eax, flags, mm0-mm7.  emms is executed before a successful return;
 *           the NULL-vp path touches no MMX register.
 */
update_viterbi27_blk_mmx:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	movl $-1,%eax		# return -1; the $ makes this an immediate, without
				# it the instruction loads from absolute address -1
	jmp  .Lv27_exit
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

1:	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   2f			# passed zero, we're done
	movl %eax,16(%ebp)

	movl 12(%ebp),%ebx	# ebx = syms
	movzbl (%ebx),%eax	# eax = first symbol, zero-extended
	movzbl 1(%ebx),%ebx	# ebx = second symbol, zero-extended
	addl $2,12(%ebp)	# syms += 2

	# shift into first array index dimension slot
	# (each symbol selects a 32-byte row: 4 groups of 8 bytes)
	shll $5,%eax
	shll $5,%ebx

# invoke macro 4 times for a total of 32 butterflies
	butterfly GROUP=0
	butterfly GROUP=1
	butterfly GROUP=2
	butterfly GROUP=3

	addl $64,%edx		# bump decision pointer

	# swap metrics
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	emms
	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0
.Lv27_exit:
	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret
	.size update_viterbi27_blk_mmx, .-update_viterbi27_blk_mmx
/* 8-byte constant vectors read by the butterfly macro.  They are never
   written, so they are placed in the read-only data section. */
	.section .rodata
	.align 8
fifteens:			# 0x0f in every byte lane: 4-bit branch metric mask
	.byte 15,15,15,15,15,15,15,15

	.align 8
ones:	.byte 1,1,1,1,1,1,1,1	# 1 in every byte lane: rounding term for the average