1/* Intel SIMD SSE implementation of Viterbi ACS butterflies
2   for 256-state (k=9) convolutional code
3   Copyright 2004 Phil Karn, KA9Q
4   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
5
6   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);
7*/
8	# SSE (64-bit integer SIMD) version
9	# Requires Pentium III or better
10	# These are offsets into struct v29, defined in viterbi29.h
11	.set DP,512
12	.set OLDMETRICS,516
13	.set NEWMETRICS,520
14	.text
15	.global update_viterbi29_blk_sse,Branchtab29_sse
16	.type update_viterbi29_blk_sse,@function
17	.align 16
18
/* update_viterbi29_blk_sse
 *
 * Runs the add-compare-select (ACS) step of the 256-state Viterbi decoder
 * once per input bit, for nbits bits.
 *
 * C prototype, as given in the header of this file:
 *   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);
 *
 * Syntax: GNU as, AT&T operand order, 32-bit x86.
 * In (stack, relative to %ebp after the prologue):
 *    8(%ebp)  vp     decoder state; the pointers at offsets DP, OLDMETRICS
 *                    and NEWMETRICS are read on entry and written back on exit
 *   12(%ebp)  syms   two symbol bytes are consumed per bit
 *   16(%ebp)  nbits  number of bits to process; nbits <= 0 processes nothing
 * Out:
 *   %eax = 0 on the normal path, -1 when vp is NULL.  (The prototype above
 *   declares the function void, so callers built against it ignore %eax.)
 * Per bit:
 *   256 new path metrics are written to the "new" metric array, 32 bytes
 *   (256 decision bits) are written at the decision pointer, which then
 *   advances by 32, and the old/new metric pointers are exchanged.
 * Clobbers:
 *   %eax, flags, %mm0-%mm7 (emms is executed on the normal exit path).
 *   The stack slots holding syms and nbits are overwritten.
 *   %ebx, %edx, %esi, %edi and %ebp are saved and restored.
 * External data: Branchtab29_sse (256 bytes are read) and thirtyones.
 *
 * Register roles inside the per-bit loop:
 *   %esi -> old metrics     %edi -> new metrics    %edx -> decision output
 *   %ebx  = decision-bit accumulator (also used briefly as the syms pointer)
 *   %mm6  = first symbol in every byte, %mm5 = second symbol in every byte
 *   %mm7  = 31 in every byte (reloaded from thirtyones for every bit)
 */
update_viterbi29_blk_sse:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	movl $-1,%eax		# vp is NULL: return -1 (immediate needs the $ prefix)
	jmp  err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

	# ---- top of the per-bit loop ----
1:	movl 16(%ebp),%eax	# eax = nbits remaining
	decl %eax
	jl   2f			# passed zero, we're done
	movl %eax,16(%ebp)	# the argument slot doubles as the loop counter

	xorl %eax,%eax		# upper 24 bits stay zero for the byte loads below
	movl 12(%ebp),%ebx	# ebx = syms
	movb (%ebx),%al
	movd %eax,%mm6		# mm6[0] = first symbol
	movb 1(%ebx),%al
	movd %eax,%mm5		# mm5[0] = second symbol
	addl $2,%ebx		# two symbols consumed per bit
	movl %ebx,12(%ebp)	# the argument slot doubles as the running syms pointer

	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
	punpcklbw %mm5,%mm5

	movq thirtyones,%mm7	# 5-bit mask; PSUBUSBM below trashes mm7, so reload per bit
	pshufw $0,%mm6,%mm6	# copy low word to upper 3
	pshufw $0,%mm5,%mm5
	# mm6 now contains first symbol in each byte, mm5 the second

	# each invocation of this macro does 8 butterflies in parallel
	# In:  esi -> old metrics, edi -> new metrics, mm5/mm6 = symbols,
	#      mm7 = 31 in every byte
	# Out: 16 new path metrics stored at offset 16*GROUP from edi,
	#      16 decision bits ORed into ebx starting at bit (16*GROUP)&31
	# Clobbers: eax, flags, mm0-mm4
	.MACRO butterfly GROUP
	# compute branch metrics
	movq Branchtab29_sse+(8*\GROUP),%mm4
	movq Branchtab29_sse+128+(8*\GROUP),%mm3
	pxor %mm6,%mm4
	pxor %mm5,%mm3
	pavgb %mm3,%mm4			# mm4 contains branch metrics
	psrlw $3,%mm4			# word shift; bits crossing between bytes are masked off next
	pand %mm7,%mm4			# keep 5 bits per byte

	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
	movq ((8*\GROUP)+128)(%esi),%mm3	# Incoming path metric, high bit = 1
	movq %mm0,%mm2
	movq %mm3,%mm1
	paddusb %mm4,%mm0
	paddusb %mm4,%mm3

	# invert branch metrics. This works only because they're 5 bits
	pxor %mm7,%mm4

	paddusb %mm4,%mm1
	paddusb %mm4,%mm2

	# Find survivors, leave in mm0,2
	pminub %mm1,%mm0
	pminub %mm3,%mm2
	# get decisions, leave in mm1,3
	# a byte becomes 0xff when the "high bit = 1" candidate equals the survivor
	pcmpeqb %mm0,%mm1
	pcmpeqb %mm2,%mm3

	# interleave the new path metrics from mm0,2 and store them
	movq %mm0,%mm4
	punpckhbw %mm2,%mm0	# interleave second 8 new metrics
	punpcklbw %mm2,%mm4	# interleave first 8 new metrics
	movq %mm0,(16*\GROUP+8)(%edi)
	movq %mm4,(16*\GROUP)(%edi)

	# interleave decisions, accumulate into %ebx
	movq %mm1,%mm4
	punpckhbw %mm3,%mm1
	punpcklbw %mm3,%mm4
	# Due to an error in the Intel instruction set ref (the register
	# fields are swapped), gas assembles pmovmskb incorrectly
	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
	.byte 0x0f,0xd7,0xc1	# pmovmskb %mm1,%eax
	shll $((16*\GROUP+8)&31),%eax
	orl %eax,%ebx
	.byte 0x0f,0xd7,0xc4	# pmovmskb %mm4,%eax
	shll $((16*\GROUP)&31),%eax
	orl %eax,%ebx
	.endm

	# invoke macro 16 times for a total of 128 butterflies
	# every pair of invocations fills one 32-bit word of decisions
	xorl %ebx,%ebx		# clear decisions
	butterfly GROUP=0
	butterfly GROUP=1
	movl %ebx,(%edx)	# stash decision bits 0-31
	xorl %ebx,%ebx
	butterfly GROUP=2
	butterfly GROUP=3
	movl %ebx,4(%edx)	# stash decision bits 32-63
	xorl %ebx,%ebx
	butterfly GROUP=4
	butterfly GROUP=5
	movl %ebx,8(%edx)	# stash decision bits 64-95
	xorl %ebx,%ebx
	butterfly GROUP=6
	butterfly GROUP=7
	movl %ebx,12(%edx)	# stash decision bits 96-127
	xorl %ebx,%ebx
	butterfly GROUP=8
	butterfly GROUP=9
	movl %ebx,16(%edx)	# stash decision bits 128-159
	xorl %ebx,%ebx
	butterfly GROUP=10
	butterfly GROUP=11
	movl %ebx,20(%edx)	# stash decision bits 160-191
	xorl %ebx,%ebx
	butterfly GROUP=12
	butterfly GROUP=13
	movl %ebx,24(%edx)	# stash decision bits 192-223
	xorl %ebx,%ebx
	butterfly GROUP=14
	butterfly GROUP=15
	movl %ebx,28(%edx)	# stash decision bits 224-255

	addl $32,%edx		# bump decision pointer

	# see if we have to normalize
	movl (%edi),%eax	# extract first output metric
	andl $255,%eax
	cmpl $50,%eax		# is it greater than 50?
	jle done		# No, no need to normalize

	# Normalize by finding smallest metric and subtracting it
	# from all metrics
	movq (%edi),%mm0
	pminub 8(%edi),%mm0
	pminub 16(%edi),%mm0
	pminub 24(%edi),%mm0
	pminub 32(%edi),%mm0
	pminub 40(%edi),%mm0
	pminub 48(%edi),%mm0
	pminub 56(%edi),%mm0
	pminub 64(%edi),%mm0
	pminub 72(%edi),%mm0
	pminub 80(%edi),%mm0
	pminub 88(%edi),%mm0
	pminub 96(%edi),%mm0
	pminub 104(%edi),%mm0
	pminub 112(%edi),%mm0
	pminub 120(%edi),%mm0
	pminub 128(%edi),%mm0
	pminub 136(%edi),%mm0
	pminub 144(%edi),%mm0
	pminub 152(%edi),%mm0
	pminub 160(%edi),%mm0
	pminub 168(%edi),%mm0
	pminub 176(%edi),%mm0
	pminub 184(%edi),%mm0
	pminub 192(%edi),%mm0
	pminub 200(%edi),%mm0
	pminub 208(%edi),%mm0
	pminub 216(%edi),%mm0
	pminub 224(%edi),%mm0
	pminub 232(%edi),%mm0
	pminub 240(%edi),%mm0
	pminub 248(%edi),%mm0
	# mm0 contains 8 smallest metrics
	# crunch down to single lowest metric
	movq %mm0,%mm1
	psrlq $32,%mm0
	pminub %mm1,%mm0	# low 4 bytes hold the running minima
	movq %mm0,%mm1
	psrlq $16,%mm0
	pminub %mm1,%mm0	# low 2 bytes hold the running minima
	movq %mm0,%mm1
	psrlq $8,%mm0
	pminub %mm1,%mm0	# byte 0 holds the overall minimum
	punpcklbw %mm0,%mm0	# expand to all 8 bytes
	pshufw $0,%mm0,%mm0

	# mm0 now contains lowest metric in all 8 bytes
	# subtract it from every output metric
	# Trashes %mm7
	.macro PSUBUSBM REG,MEM
	movq \MEM,%mm7
	psubusb \REG,%mm7
	movq %mm7,\MEM
	.endm

	PSUBUSBM %mm0,(%edi)
	PSUBUSBM %mm0,8(%edi)
	PSUBUSBM %mm0,16(%edi)
	PSUBUSBM %mm0,24(%edi)
	PSUBUSBM %mm0,32(%edi)
	PSUBUSBM %mm0,40(%edi)
	PSUBUSBM %mm0,48(%edi)
	PSUBUSBM %mm0,56(%edi)
	PSUBUSBM %mm0,64(%edi)
	PSUBUSBM %mm0,72(%edi)
	PSUBUSBM %mm0,80(%edi)
	PSUBUSBM %mm0,88(%edi)
	PSUBUSBM %mm0,96(%edi)
	PSUBUSBM %mm0,104(%edi)
	PSUBUSBM %mm0,112(%edi)
	PSUBUSBM %mm0,120(%edi)
	PSUBUSBM %mm0,128(%edi)
	PSUBUSBM %mm0,136(%edi)
	PSUBUSBM %mm0,144(%edi)
	PSUBUSBM %mm0,152(%edi)
	PSUBUSBM %mm0,160(%edi)
	PSUBUSBM %mm0,168(%edi)
	PSUBUSBM %mm0,176(%edi)
	PSUBUSBM %mm0,184(%edi)
	PSUBUSBM %mm0,192(%edi)
	PSUBUSBM %mm0,200(%edi)
	PSUBUSBM %mm0,208(%edi)
	PSUBUSBM %mm0,216(%edi)
	PSUBUSBM %mm0,224(%edi)
	PSUBUSBM %mm0,232(%edi)
	PSUBUSBM %mm0,240(%edi)
	PSUBUSBM %mm0,248(%edi)

done:
	# swap metrics
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	emms			# leave MMX state so x87 code can run again
	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret
	.size update_viterbi29_blk_sse,.-update_viterbi29_blk_sse
265
266	.data
267	.align 8
268thirtyones:
269	.byte 31,31,31,31,31,31,31,31
270
271
272