1/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
2   for 64-state (k=7) convolutional code
3   Copyright 2001 Phil Karn, KA9Q
4   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
5
6   int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ;
7*/
8
9	# SSE (64-bit integer SIMD) version
10	# Requires Pentium III or better
11
12	# These are offsets into struct v27, defined in viterbi27.h
13	.set DP,128
14	.set OLDMETRICS,132
15	.set NEWMETRICS,136
16.text
17.global update_viterbi27_blk_sse,Branchtab27_sse
18	.type update_viterbi27_blk_sse,@function
19	.align 16
20
/*
 * int update_viterbi27_blk_sse(struct v27 *vp, unsigned char syms[], int nbits)
 *
 * Runs nbits add-compare-select steps of the 64-state (k=7) Viterbi
 * decoder.  AT&T syntax, 32-bit x86, all arguments on the stack:
 *
 *    8(%ebp)  vp     decoder state; the DP, OLDMETRICS and NEWMETRICS
 *                    pointer fields are read on entry and written back
 *                    on normal exit
 *   12(%ebp)  syms   received symbols, two bytes consumed per step
 *   16(%ebp)  nbits  number of steps; values <= 0 perform no steps
 *
 * The syms and nbits stack slots double as the loop variables and are
 * modified in place.
 *
 * Each step reads 64 byte-wide metrics through the old-metrics pointer,
 * writes 64 through the new-metrics pointer, stores 64 decision bits
 * (two 32-bit words) at the decision pointer, advances that pointer by
 * 8 bytes and swaps the two metric pointers.  When the first new metric
 * exceeds 150, the smallest of the 64 new metrics is subtracted from
 * all of them.
 *
 * Returns 0 in %eax, or -1 when vp is NULL (nothing else is touched in
 * that case).  %ebp, %esi, %edi, %edx and %ebx are saved and restored.
 * %mm0-%mm7 are used as scratch; emms is executed before the normal
 * return.
 *
 * External data: Branchtab27_sse (32 bytes for the first symbol followed
 * by 32 bytes for the second) and thirtyones (eight bytes of 31).
 */

	# butterfly GROUP -- 8 butterflies in parallel (GROUP = 0..3).
	# In:   %mm6/%mm5 = first/second symbol replicated into every byte
	#       %mm7 = thirtyones, %esi -> old metrics, %edi -> new metrics
	# Out:  16 new metrics stored at 16*GROUP(%edi); the matching 16
	#       decision bits are ORed into %ebx at bit (16*GROUP)&31
	# Uses: %eax, %mm0-%mm4
	.macro butterfly GROUP
	# compute branch metrics
	movq Branchtab27_sse+(8*\GROUP),%mm4
	movq Branchtab27_sse+32+(8*\GROUP),%mm3
	pxor %mm6,%mm4
	pxor %mm5,%mm3
	pavgb %mm3,%mm4			# average the two per-symbol metrics
	psrlw $3,%mm4
	pand %mm7,%mm4			# keep 5 bits per byte: branch metrics

	movq (8*\GROUP)(%esi),%mm0	# incoming path metric, high bit = 0
	movq ((8*\GROUP)+32)(%esi),%mm3	# incoming path metric, high bit = 1
	movq %mm0,%mm2
	movq %mm3,%mm1
	paddusb %mm4,%mm0
	paddusb %mm4,%mm3

	# invert branch metrics; valid only because they are 5 bits wide
	pxor %mm7,%mm4

	paddusb %mm4,%mm1
	paddusb %mm4,%mm2

	# survivors (smaller candidate of each pair) end up in mm0 and mm2
	pminub %mm1,%mm0
	pminub %mm3,%mm2
	# decisions: a byte of mm1/mm3 is all ones where that candidate
	# equals the survivor
	pcmpeqb %mm0,%mm1
	pcmpeqb %mm2,%mm3

	# interleave the survivor metrics and store them as new metrics
	movq %mm0,%mm4
	punpckhbw %mm2,%mm0		# second 8 new metrics
	punpcklbw %mm2,%mm4		# first 8 new metrics
	movq %mm0,(16*\GROUP+8)(%edi)
	movq %mm4,(16*\GROUP)(%edi)

	# interleave the decisions the same way, accumulate into %ebx
	movq %mm1,%mm4
	punpckhbw %mm3,%mm1
	punpcklbw %mm3,%mm4
	# pmovmskb is emitted as raw bytes: the Intel instruction set
	# reference had the register fields swapped, so gas assembled the
	# mnemonic incorrectly.
	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
	.byte 0x0f,0xd7,0xc1		# pmovmskb %mm1,%eax
	shll $((16*\GROUP+8)&31),%eax
	orl %eax,%ebx
	.byte 0x0f,0xd7,0xc4		# pmovmskb %mm4,%eax
	shll $((16*\GROUP)&31),%eax
	orl %eax,%ebx
	.endm

	# PSUBUSBM REG,MEM -- MEM = saturating-unsigned-byte (MEM - REG).
	# Trashes %mm7.
	.macro PSUBUSBM REG,MEM
	movq \MEM,%mm7
	psubusb \REG,%mm7
	movq %mm7,\MEM
	.endm

update_viterbi27_blk_sse:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	movl $-1,%eax		# NULL vp: return -1 ($ makes it an immediate)
	jmp  err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

	# ---- top of the per-bit loop ----
1:	movl 16(%ebp),%eax	# eax = remaining nbits
	decl %eax
	jl   2f			# went below zero: all bits processed
	movl %eax,16(%ebp)

	xorl %eax,%eax
	movl 12(%ebp),%ebx	# ebx = syms
	movb (%ebx),%al
	movd %eax,%mm6		# mm6[0] = first symbol
	movb 1(%ebx),%al
	movd %eax,%mm5		# mm5[0] = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)	# syms += 2 for the next step

	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
	punpcklbw %mm5,%mm5
	movq thirtyones,%mm7	# reloaded every step: PSUBUSBM trashes mm7

	pshufw $0,%mm6,%mm6	# copy low word to upper 3
	pshufw $0,%mm5,%mm5
	# mm6 now holds the first symbol in each byte, mm5 the second

	# four macro invocations = 32 butterflies = 64 new metrics
	xorl %ebx,%ebx		# clear decisions
	butterfly GROUP=0
	butterfly GROUP=1
	movl %ebx,(%edx)	# stash first 32 decisions
	xorl %ebx,%ebx
	butterfly GROUP=2
	butterfly GROUP=3
	movl %ebx,4(%edx)	# stash second 32 decisions

	addl $8,%edx		# bump decision pointer

	# normalize only when the first new metric has grown past 150
	movzbl (%edi),%eax
	cmpl $150,%eax
	jle done

	# Normalize: find the smallest of the 64 new metrics and subtract
	# it from every one of them.
	movq (%edi),%mm0
	pminub 8(%edi),%mm0
	pminub 16(%edi),%mm0
	pminub 24(%edi),%mm0
	pminub 32(%edi),%mm0
	pminub 40(%edi),%mm0
	pminub 48(%edi),%mm0
	pminub 56(%edi),%mm0
	# mm0 holds 8 column-wise minima; fold them down to one byte
	movq %mm0,%mm1
	psrlq $32,%mm0
	pminub %mm1,%mm0
	movq %mm0,%mm1
	psrlq $16,%mm0
	pminub %mm1,%mm0
	movq %mm0,%mm1
	psrlq $8,%mm0
	pminub %mm1,%mm0
	punpcklbw %mm0,%mm0	# expand to all 8 bytes
	pshufw $0,%mm0,%mm0
	# mm0 now holds the lowest metric in all 8 bytes

	PSUBUSBM %mm0,(%edi)
	PSUBUSBM %mm0,8(%edi)
	PSUBUSBM %mm0,16(%edi)
	PSUBUSBM %mm0,24(%edi)
	PSUBUSBM %mm0,32(%edi)
	PSUBUSBM %mm0,40(%edi)
	PSUBUSBM %mm0,48(%edi)
	PSUBUSBM %mm0,56(%edi)

done:	# swap metric pointers: the new metrics feed the next step
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	emms			# leave MMX state so x87 code can run again
	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp

	ret
	.size update_viterbi27_blk_sse,.-update_viterbi27_blk_sse
197
198	.data
199
200	.align 16
201thirtyones:
202	.byte 31,31,31,31,31,31,31,31
203
204
205
206