/* Intel SIMD SSE implementation of Viterbi ACS butterflies
   for 256-state (k=9) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);

   Syntax:  GNU as, AT&T operand order, 32-bit x86.
   ABI:     arguments are read from the stack relative to the frame pointer:
              8(%ebp)  = vp
              12(%ebp) = syms   (advanced by 2 bytes per decoded bit)
              16(%ebp) = nbits  (counted down in place)
   Returns: %eax = 0 after processing nbits bits, -1 if vp is NULL.
            (The C prototype above declares void; the value is set anyway.)
   Effects: per bit, reads 256 old path metrics, writes 256 new path
            metrics and 32 bytes of decisions, then swaps the two metric
            pointers. On exit the metric pointers and the advanced
            decision pointer are written back into *vp.
   Saves:   %ebp, %esi, %edi, %edx, %ebx (restored before return).
   Uses:    %eax, %mm0-%mm7 as scratch; emms is executed before the
            normal return.

   Register roles inside the per-bit loop:
     %esi -> old metrics      %edi -> new metrics     %edx -> decisions
     %ebx  = 32-bit decision accumulator (also scratch for syms pointer)
     %mm6  = first symbol in all 8 bytes
     %mm5  = second symbol in all 8 bytes
     %mm7  = 31 in all 8 bytes (reloaded every bit; normalization trashes it)
*/
	# SSE (64-bit integer SIMD) version
	# Requires Pentium III or better
	# These are offsets into struct v29, defined in viterbi29.h
	.set DP,512
	.set OLDMETRICS,516
	.set NEWMETRICS,520

	# The new metrics are renormalized when the first of them exceeds this
	.set RENORM_THRESHOLD,50

	# Each invocation of this macro does 8 butterflies in parallel.
	# GROUP (0..15) selects which 8 of the 128 butterflies are done.
	# In:	 %esi, %edi, %mm5, %mm6, %mm7 as described in the file header
	# Out:	 16 new metrics stored at offset 16*GROUP from %edi,
	#	 16 decision bits ORed into %ebx at bit (16*GROUP)&31 and up
	# Trashes: %eax, %mm0-%mm4
	.macro butterfly GROUP
	# Compute branch metrics: XOR each table entry with its symbol,
	# average the two results, then scale down to 5 bits. The word
	# shift drags bits across byte boundaries; the AND with 31 clears them.
	movq Branchtab29_sse+(8*\GROUP),%mm4
	movq Branchtab29_sse+128+(8*\GROUP),%mm3
	pxor %mm6,%mm4
	pxor %mm5,%mm3
	pavgb %mm3,%mm4			# mm4 contains branch metrics
	psrlw $3,%mm4
	pand %mm7,%mm4

	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
	movq ((8*\GROUP)+128)(%esi),%mm3	# Incoming path metric, high bit = 1
	movq %mm0,%mm2
	movq %mm3,%mm1
	paddusb %mm4,%mm0
	paddusb %mm4,%mm3

	# Invert branch metrics (31 - metric). XOR with 31 does this
	# only because the metrics are 5 bits wide.
	pxor %mm7,%mm4

	paddusb %mm4,%mm1
	paddusb %mm4,%mm2

	# Find survivors, leave in mm0,2
	pminub %mm1,%mm0
	pminub %mm3,%mm2
	# Get decisions, leave in mm1,3: a byte is all ones where the
	# survivor equals the candidate that came from the high-bit-1 state
	pcmpeqb %mm0,%mm1
	pcmpeqb %mm2,%mm3

	# Interleave the survivors from mm0,2 and store them as new path metrics
	movq %mm0,%mm4
	punpckhbw %mm2,%mm0	# interleave second 8 new metrics
	punpcklbw %mm2,%mm4	# interleave first 8 new metrics
	movq %mm0,(16*\GROUP+8)(%edi)
	movq %mm4,(16*\GROUP)(%edi)

	# Interleave decisions, accumulate into %ebx
	movq %mm1,%mm4
	punpckhbw %mm3,%mm1
	punpcklbw %mm3,%mm4
	# Due to an error in the Intel instruction set ref (the register
	# fields are swapped), gas assembles pmovmskb incorrectly
	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
	.byte 0x0f,0xd7,0xc1	# pmovmskb %mm1,%eax
	shll $((16*\GROUP+8)&31),%eax
	orl %eax,%ebx
	.byte 0x0f,0xd7,0xc4	# pmovmskb %mm4,%eax
	shll $((16*\GROUP)&31),%eax
	orl %eax,%ebx
	.endm

	.text
	.global update_viterbi29_blk_sse,Branchtab29_sse
	.type update_viterbi29_blk_sse,@function
	.align 16

update_viterbi29_blk_sse:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  .Lhave_vp
	movl $-1,%eax		# NULL vp: return -1 (immediate, not a load from address -1)
	jmp  .Lreturn

.Lhave_vp:
	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

	# ---- top of the per-bit loop ----
.Lnext_bit:
	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   .Lfinish		# passed zero, all bits processed
	movl %eax,16(%ebp)

	# Fetch the two symbols for this bit and advance syms
	movl 12(%ebp),%ebx	# ebx = syms
	movzbl (%ebx),%eax
	movd %eax,%mm6		# mm6[0] = first symbol
	movzbl 1(%ebx),%eax
	movd %eax,%mm5		# mm5[0] = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)

	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
	punpcklbw %mm5,%mm5

	movq thirtyones,%mm7
	pshufw $0,%mm6,%mm6	# copy low word to upper 3
	pshufw $0,%mm5,%mm5
	# mm6 now contains first symbol in each byte, mm5 the second

	# Invoke the macro 16 times for a total of 128 butterflies.
	# Every pair of invocations fills one 32-bit decision word.
	xorl %ebx,%ebx		# clear decisions
	butterfly GROUP=0
	butterfly GROUP=1
	movl %ebx,(%edx)	# stash decisions 0-31

	xorl %ebx,%ebx
	butterfly GROUP=2
	butterfly GROUP=3
	movl %ebx,4(%edx)	# stash decisions 32-63

	xorl %ebx,%ebx
	butterfly GROUP=4
	butterfly GROUP=5
	movl %ebx,8(%edx)	# stash decisions 64-95

	xorl %ebx,%ebx
	butterfly GROUP=6
	butterfly GROUP=7
	movl %ebx,12(%edx)	# stash decisions 96-127

	xorl %ebx,%ebx
	butterfly GROUP=8
	butterfly GROUP=9
	movl %ebx,16(%edx)	# stash decisions 128-159

	xorl %ebx,%ebx
	butterfly GROUP=10
	butterfly GROUP=11
	movl %ebx,20(%edx)	# stash decisions 160-191

	xorl %ebx,%ebx
	butterfly GROUP=12
	butterfly GROUP=13
	movl %ebx,24(%edx)	# stash decisions 192-223

	xorl %ebx,%ebx
	butterfly GROUP=14
	butterfly GROUP=15
	movl %ebx,28(%edx)	# stash decisions 224-255

	addl $32,%edx		# bump decision pointer

	# See if we have to normalize: only the first output metric is tested
	cmpb $RENORM_THRESHOLD,(%edi)
	jbe  .Lswap		# not above the threshold, no need to normalize

	# Normalize by finding smallest metric and subtracting it
	# from all metrics
	movq (%edi),%mm0
	.irp off,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248
	pminub \off(%edi),%mm0
	.endr
	# mm0 contains 8 smallest metrics
	# crunch down to single lowest metric in byte 0
	movq %mm0,%mm1
	psrlq $32,%mm0
	pminub %mm1,%mm0
	movq %mm0,%mm1
	psrlq $16,%mm0
	pminub %mm1,%mm0
	movq %mm0,%mm1
	psrlq $8,%mm0
	pminub %mm1,%mm0
	punpcklbw %mm0,%mm0	# expand to all 8 bytes
	pshufw $0,%mm0,%mm0

	# mm0 now contains lowest metric in all 8 bytes
	# subtract it from every output metric
	# Trashes %mm7 (reloaded from thirtyones at the top of the loop)
	.irp off,0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248
	movq \off(%edi),%mm7
	psubusb %mm0,%mm7
	movq %mm7,\off(%edi)
	.endr

.Lswap:
	# swap metrics: the metrics just written become the old metrics
	xchgl %esi,%edi
	jmp  .Lnext_bit

.Lfinish:
	emms
	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0

.Lreturn:
	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret
	.size update_viterbi29_blk_sse,.-update_viterbi29_blk_sse

	.data
	.align 8
	# Mask and "31 minus x" constant for the 5-bit branch metrics
thirtyones:
	.byte 31,31,31,31,31,31,31,31