/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
   for a 64-state (k=7) convolutional code.
   Copyright 2001 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits);
*/

/* SSE integer-SIMD build (works on the 64-bit %mm registers); needs a Pentium III or later. */

	# Byte offsets of the struct v27 fields used below. struct v27 is
	# declared in viterbi27.h, so these values must track that header.
	.equ DP,128			# slot holding the decision pointer
	.equ OLDMETRICS,132		# slot holding the old-metrics pointer
	.equ NEWMETRICS,136		# slot holding the new-metrics pointer

	.text
	.globl update_viterbi27_blk_sse
	.globl Branchtab27_sse
	.type update_viterbi27_blk_sse,@function
	.p2align 4			# 16-byte alignment for the entry point

/*
 * int update_viterbi27_blk_sse(struct v27 *vp, unsigned char syms[], int nbits)
 *
 * Runs nbits add-compare-select (ACS) steps of the 64-state decoder.
 * Each step consumes two bytes from syms[], writes 64 new path metrics,
 * and appends 64 decision bits (8 bytes) at the decision pointer.
 *
 * ABI:      32-bit, arguments on the stack, frame pointer in %ebp
 *             8(%ebp)  = vp
 *             12(%ebp) = syms   (advanced in place, 2 bytes per step)
 *             16(%ebp) = nbits  (counted down in place)
 * Returns:  %eax = 0 on success, -1 if vp is NULL.
 * Saved:    %ebp, %esi, %edi, %edx, %ebx are pushed and restored.
 * Clobbers: %eax, flags, and on the non-error path %mm0-%mm7
 *           (emms is executed before returning from that path).
 * Side effects: the OLDMETRICS, NEWMETRICS and DP slots of *vp are
 *           rewritten on the non-error path.
 *
 * Register roles inside the loop:
 *   %esi -> incoming (old) path metrics, 64 bytes
 *   %edi -> outgoing (new) path metrics, 64 bytes
 *   %edx -> where the next 8 bytes of decisions go
 *   %ebx  = syms pointer while fetching, then decision-bit accumulator
 *   %mm6, %mm5 = first, second symbol replicated into all 8 bytes
 *   %mm7  = 31 in every byte (reloaded every step; PSUBUSBM trashes it)
 *
 * Note: thirtyones and Branchtab27_sse are reached with absolute
 * addressing, so this code is not position-independent.
 */

	# First new metric above this value triggers normalization.
	.set RENORM_THRESHOLD,150

	# butterfly GROUP: 8 butterflies in parallel (16 new metrics).
	# Inputs:   %esi, %edi, %mm5, %mm6, %mm7 as described above.
	# Output:   16 metrics stored at 16*GROUP(%edi); 16 decision bits
	#           ORed into %ebx at bit position (16*GROUP) mod 32.
	# Clobbers: %eax, flags, %mm0-%mm4.
	.macro butterfly GROUP
	# Branch metrics: XOR each symbol with its Branchtab27_sse row and
	# average the two results.
	movq Branchtab27_sse+(8*\GROUP),%mm4
	movq Branchtab27_sse+32+(8*\GROUP),%mm3
	pxor %mm6,%mm4
	pxor %mm5,%mm3
	pavgb %mm3,%mm4
	psrlw $3,%mm4			# word-wide shift leaks bits between bytes,
	pand %mm7,%mm4			# so mask every byte back down to 5 bits

	movq (8*\GROUP)(%esi),%mm0	# incoming path metric, high bit = 0
	movq ((8*\GROUP)+32)(%esi),%mm3	# incoming path metric, high bit = 1
	movq %mm0,%mm2
	movq %mm3,%mm1
	paddusb %mm4,%mm0		# mm0 = old0 + bm
	paddusb %mm4,%mm3		# mm3 = old1 + bm

	# Invert the branch metrics (bm becomes 31 - bm). XOR with 31 does
	# this only because the metrics are 5 bits wide.
	pxor %mm7,%mm4

	paddusb %mm4,%mm1		# mm1 = old1 + (31 - bm)
	paddusb %mm4,%mm2		# mm2 = old0 + (31 - bm)

	# Survivors end up in mm0 and mm2.
	pminub %mm1,%mm0
	pminub %mm3,%mm2
	# Decisions end up in mm1 and mm3: 0xff in each byte where the
	# high-bit-1 candidate equals the survivor.
	pcmpeqb %mm0,%mm1
	pcmpeqb %mm2,%mm3

	# Interleave the mm0/mm2 survivors byte by byte and store them as
	# 16 consecutive new path metrics.
	movq %mm0,%mm4
	punpckhbw %mm2,%mm0		# upper 8 of the 16
	punpcklbw %mm2,%mm4		# lower 8 of the 16
	movq %mm0,(16*\GROUP+8)(%edi)
	movq %mm4,(16*\GROUP)(%edi)

	# Interleave the decisions the same way and pack one bit per byte
	# into %ebx.
	movq %mm1,%mm4
	punpckhbw %mm3,%mm1
	punpcklbw %mm3,%mm4
	# Due to an error in the Intel instruction set ref (the register
	# fields are swapped), gas assembles pmovmskb incorrectly, so the
	# instruction is emitted as raw bytes.
	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
	.byte 0x0f,0xd7,0xc1		# pmovmskb %mm1,%eax
	shll $((16*\GROUP+8)&31),%eax
	orl %eax,%ebx
	.byte 0x0f,0xd7,0xc4		# pmovmskb %mm4,%eax
	shll $((16*\GROUP)&31),%eax
	orl %eax,%ebx
	.endm

	# PSUBUSBM REG,MEM: MEM = MEM - REG, per byte, unsigned saturating.
	# Trashes %mm7.
	.macro PSUBUSBM REG,MEM
	movq \MEM,%mm7
	psubusb \REG,%mm7
	movq %mm7,\MEM
	.endm

update_viterbi27_blk_sse:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx		# edx = vp
	testl %edx,%edx
	jz .Lnull_vp			# NULL vp: report -1
	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx		# edx -> decisions

	# ---- one iteration per decoded bit ----
1:	movl 16(%ebp),%eax		# eax = nbits
	decl %eax
	jl 2f				# went below zero: finished
	movl %eax,16(%ebp)

	# Fetch the two symbols for this bit and advance syms by 2.
	movl 12(%ebp),%ebx		# ebx = syms
	movzbl (%ebx),%eax
	movd %eax,%mm6			# mm6 byte 0 = first symbol
	movzbl 1(%ebx),%eax
	movd %eax,%mm5			# mm5 byte 0 = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)

	# Replicate each symbol into all 8 bytes of its register.
	punpcklbw %mm6,%mm6		# byte 1 = byte 0
	punpcklbw %mm5,%mm5
	movq thirtyones,%mm7
	pshufw $0,%mm6,%mm6		# low word -> all four words
	pshufw $0,%mm5,%mm5

	# Four macro invocations = 32 butterflies = 64 new metrics.
	xorl %ebx,%ebx			# clear decision accumulator
	butterfly GROUP=0
	butterfly GROUP=1
	movl %ebx,(%edx)		# first 32 decisions
	xorl %ebx,%ebx
	butterfly GROUP=2
	butterfly GROUP=3
	movl %ebx,4(%edx)		# second 32 decisions

	addl $8,%edx			# bump decision pointer

	# Normalize only when the first new metric exceeds the threshold.
	movzbl (%edi),%eax
	cmpl $RENORM_THRESHOLD,%eax
	jbe .Lswap_metrics

	# Find the smallest of the 64 new metrics ...
	movq (%edi),%mm0
	pminub 8(%edi),%mm0
	pminub 16(%edi),%mm0
	pminub 24(%edi),%mm0
	pminub 32(%edi),%mm0
	pminub 40(%edi),%mm0
	pminub 48(%edi),%mm0
	pminub 56(%edi),%mm0
	# ... mm0 holds 8 column minima; fold them down to byte 0 ...
	movq %mm0,%mm1
	psrlq $32,%mm0
	pminub %mm1,%mm0
	movq %mm0,%mm1
	psrlq $16,%mm0
	pminub %mm1,%mm0
	movq %mm0,%mm1
	psrlq $8,%mm0
	pminub %mm1,%mm0
	# ... and broadcast that byte to all 8 bytes.
	punpcklbw %mm0,%mm0
	pshufw $0,%mm0,%mm0

	# Subtract the minimum from every new metric.
	PSUBUSBM %mm0,(%edi)
	PSUBUSBM %mm0,8(%edi)
	PSUBUSBM %mm0,16(%edi)
	PSUBUSBM %mm0,24(%edi)
	PSUBUSBM %mm0,32(%edi)
	PSUBUSBM %mm0,40(%edi)
	PSUBUSBM %mm0,48(%edi)
	PSUBUSBM %mm0,56(%edi)

.Lswap_metrics:
	xchgl %esi,%edi			# new metrics become old for the next bit
	jmp 1b

	# ---- normal exit ----
2:	emms
	movl 8(%ebp),%ebx		# ebx = vp
	movl %esi,OLDMETRICS(%ebx)	# write back the metric pointers
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)		# write back the advanced decision pointer
	xorl %eax,%eax			# return 0
.Lepilogue:
	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret

	# Error path, kept off the fall-through route. No MMX state has been
	# touched when we get here, so emms is not needed.
.Lnull_vp:
	movl $-1,%eax			# immediate -1 (the $ is required)
	jmp .Lepilogue

	.size update_viterbi27_blk_sse,.-update_viterbi27_blk_sse

/* 31 in each of 8 bytes: masks branch metrics to 5 bits and, via XOR, inverts them. */
	# Only ever read, and not exported, so it lives in read-only data.
	.section .rodata
	.p2align 4
	.type thirtyones,@object
	.size thirtyones,8
thirtyones:
	.byte 31,31,31,31,31,31,31,31