/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
   for 64-state (k=7) convolutional code
   Copyright 2003 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ;
*/
# SSE2 build: relies on the 128-bit integer SIMD instructions,
# so it needs a Pentium 4 or later CPU.

	# Byte offsets of the fields used here inside struct v27
	# (the structure itself is declared in viterbi27.h)
	.equ DP,128
	.equ OLDMETRICS,132
	.equ NEWMETRICS,136

	.text
	.globl update_viterbi27_blk_sse2,Branchtab27_sse2
	.type update_viterbi27_blk_sse2,@function
	.p2align 4		# 16-byte alignment for the entry point

# ----------------------------------------------------------------------
# update_viterbi27_blk_sse2(struct v27 *vp, unsigned char syms[], int nbits)
#
# Runs nbits add-compare-select steps of the 64-state trellis.
#
# ABI:   32-bit x86, AT&T syntax, arguments read from the stack frame
# In:    8(%ebp)  = vp     decoder state; only the DP, OLDMETRICS and
#                          NEWMETRICS pointer fields are touched
#        12(%ebp) = syms   two symbol bytes are consumed per step
#        16(%ebp) = nbits  number of steps; nothing is done when <= 0
# Out:   %eax = 0 on success, -1 when vp is NULL
#        vp's two metric pointers and its decision pointer are written back
#        (the decision pointer advances 8 bytes per step)
# Clobb: %eax, %xmm0-%xmm7, flags.  %esi, %edi, %edx, %ebx and %ebp are
#        saved and restored.  The stack copies of syms and nbits are
#        updated in place as the loop progresses.
# Align: the metric buffers, Branchtab27_sse2 and thirtyones are accessed
#        with movdqa, so all of them must be 16-byte aligned.
# ----------------------------------------------------------------------

	# butterfly GROUP
	# Each invocation of this macro does 16 butterflies in parallel.
	# Expects: %esi -> old metrics, %edi -> new metrics, %edx -> decisions,
	#          %xmm6 = first symbol in every byte, %xmm5 = second symbol
	#          in every byte, %xmm7 = 31 in every byte.
	# Clobbers %eax, %ebx, %xmm0-%xmm4 and flags.
	# Defining the macro emits no code, so it can sit ahead of the entry label.
	.macro butterfly GROUP
	# compute branch metrics: XOR each received symbol with its table entry
	movdqa Branchtab27_sse2+(16*\GROUP),%xmm4
	movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3
	pxor %xmm6,%xmm4
	pxor %xmm5,%xmm3

	# Combine the two per-symbol metrics into one 5-bit branch metric in
	# xmm4: pavgb averages them byte-wise and the shift drops 3 more bits.
	# A metric this small is okay for this code because the worst-case
	# metric spread (at high Eb/No) is only 120, well within the range of
	# our unsigned 8-bit path metrics, and even within the range of signed
	# 8-bit path metrics.
	pavgb %xmm3,%xmm4
	psrlw $3,%xmm4

	# psrlw shifts 16-bit words, so bits from the neighbouring byte leak
	# into the top of each low byte; the mask keeps only the 5 metric bits
	pand %xmm7,%xmm4

	movdqa (16*\GROUP)(%esi),%xmm0	# Incoming path metric, high bit = 0
	movdqa ((16*\GROUP)+32)(%esi),%xmm3	# Incoming path metric, high bit = 1
	movdqa %xmm0,%xmm2
	movdqa %xmm3,%xmm1
	paddusb %xmm4,%xmm0	# note use of saturating arithmetic
	paddusb %xmm4,%xmm3	# this shouldn't be necessary, but why not?

	# negate branch metrics (metric -> 31 - metric)
	pxor %xmm7,%xmm4
	paddusb %xmm4,%xmm1
	paddusb %xmm4,%xmm2

	# Find survivors, leave in xmm0,xmm2
	pminub %xmm1,%xmm0
	pminub %xmm3,%xmm2
	# get decisions, leave in xmm1,xmm3
	# (0xff in a byte where the "high bit = 1" candidate equals the survivor)
	pcmpeqb %xmm0,%xmm1
	pcmpeqb %xmm2,%xmm3

	# interleave the surviving path metrics from xmm0,xmm2 and store them
	movdqa %xmm0,%xmm4
	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
	movdqa %xmm0,(32*\GROUP+16)(%edi)
	movdqa %xmm4,(32*\GROUP)(%edi)

	# interleave decisions & store as one 32-bit word per group
	movdqa %xmm1,%xmm4
	punpckhbw %xmm3,%xmm1
	punpcklbw %xmm3,%xmm4
	# work around bug in gas due to Intel doc error:
	# the instructions are emitted as raw bytes
	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
	shll $16,%ebx
	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
	orl %eax,%ebx
	movl %ebx,(4*\GROUP)(%edx)
	.endm

update_viterbi27_blk_sse2:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	# vp == NULL: return -1.  The '$' matters: without it the operand is
	# a memory reference to address 0xffffffff and the load faults.
	movl $-1,%eax
	jmp  err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

	# ---- per-bit loop; the counter lives in the nbits stack slot ----
1:	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   2f			# passed zero, we're done
	movl %eax,16(%ebp)

	# fetch the two received symbols for this bit and advance syms
	xorl %eax,%eax
	movl 12(%ebp),%ebx	# ebx = syms
	movb (%ebx),%al
	movd %eax,%xmm6		# xmm6[0] = first symbol
	movb 1(%ebx),%al
	movd %eax,%xmm5		# xmm5[0] = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)

	# broadcast each symbol byte across its whole register
	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
	punpcklbw %xmm5,%xmm5
	pshuflw $0,%xmm6,%xmm6	# copy low word to low 3
	pshuflw $0,%xmm5,%xmm5
	punpcklqdq %xmm6,%xmm6  # propagate to all 16
	punpcklqdq %xmm5,%xmm5
	# xmm6 now contains first symbol in each byte, xmm5 the second

	movdqa thirtyones,%xmm7	# 5-bit mask / negation constant for butterfly

	# invoke macro 2 times for a total of 32 butterflies
	butterfly GROUP=0
	butterfly GROUP=1

	addl $8,%edx		# bump decision pointer

	# See if we have to normalize. This requires an explanation. We don't want
	# our path metrics to exceed 255 on the *next* iteration. Since the
	# largest branch metric is 30, that means we don't want any to exceed 225
	# on *this* iteration. Rather than look at them all, we just pick an arbitrary
	# one (the first) and see if it exceeds 225-120=105, where 120 is the
	# experimentally-determined worst-case metric spread for this code and
	# branch metrics in the range 0-30.

	# This is extremely conservative, and empirical testing at a variety of Eb/Nos might
	# show that a higher threshold could be used without affecting BER performance
	movzbl (%edi),%eax	# extract first output metric (one byte)
	cmpl $105,%eax
	jle done		# No, no need to normalize

	# Normalize by finding smallest metric and subtracting it
	# from all metrics. We can't just pick an arbitrary small constant because
	# the minimum metric might be zero!
	movdqa (%edi),%xmm0
	movdqa %xmm0,%xmm4
	movdqa 16(%edi),%xmm1
	pminub %xmm1,%xmm4
	movdqa 32(%edi),%xmm2
	pminub %xmm2,%xmm4
	movdqa 48(%edi),%xmm3
	pminub %xmm3,%xmm4

	# crunch down to single lowest metric by repeatedly folding
	# the upper half of the remaining bytes onto the lower half
	movdqa %xmm4,%xmm5
	psrldq $8,%xmm5     # the count to psrldq is bytes, not bits!
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $32,%xmm5
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $16,%xmm5
	pminub %xmm5,%xmm4
	movdqa %xmm4,%xmm5
	psrlq $8,%xmm5
	pminub %xmm5,%xmm4	# now in lowest byte of %xmm4

	punpcklbw %xmm4,%xmm4	# lowest 2 bytes
	pshuflw $0,%xmm4,%xmm4  # lowest 8 bytes
	punpcklqdq %xmm4,%xmm4	# all 16 bytes

	# xmm4 now contains lowest metric in all 16 bytes
	# subtract it from every output metric
	psubusb %xmm4,%xmm0
	psubusb %xmm4,%xmm1
	psubusb %xmm4,%xmm2
	psubusb %xmm4,%xmm3
	movdqa %xmm0,(%edi)
	movdqa %xmm1,16(%edi)
	movdqa %xmm2,32(%edi)
	movdqa %xmm3,48(%edi)

done:
	# swap metrics: this step's output is the next step's input
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0
err:	popl %ebx		# common exit; eax already holds the return value
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret
	.size update_viterbi27_blk_sse2,.-update_viterbi27_blk_sse2

# Constant used by update_viterbi27_blk_sse2: the value 31 in each of
# 16 bytes.  It is loaded with movdqa, so it must sit on a 16-byte boundary.
	.data
	.p2align 4

thirtyones:
	.fill 16,1,31		# 16 one-byte entries, each 31
203