lpc_asm.s revision c74663799493f2b1e6123c18def94295d0afab7b
#  libFLAC - Free Lossless Audio Codec library
#  Copyright (C) 2004,2005,2006,2007  Josh Coalson
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#
#  - Redistributions of source code must retain the above copyright
#  notice, this list of conditions and the following disclaimer.
#
#  - Redistributions in binary form must reproduce the above copyright
#  notice, this list of conditions and the following disclaimer in the
#  documentation and/or other materials provided with the distribution.
#
#  - Neither the name of the Xiph.org Foundation nor the names of its
#  contributors may be used to endorse or promote products derived from
#  this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.text
	.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16, @function

.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8, @function

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
#	r3: residual[]
#	r4: data_len
#	r5: qlp_coeff[]
#	r6: order
#	r7: lp_quantization
#	r8: data[]

# see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
# this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
# bps<=15 for mid-side coding, since that uses an extra bit)
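#
# for reference, a rough C sketch of the loop this routine implements (an
# approximation of the lpc.c code, not the exact library source):
#
#	for(i = 0; i < data_len; i++) {
#		sum = 0;
#		for(j = 0; j < order; j++)
#			sum += qlp_coeff[j] * data[i-j-1];
#		data[i] = residual[i] + (sum >> lp_quantization);
#	}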

# these should be fast; the inner loop is unrolled (it takes no more than
# 3*(order%4) instructions, all of which are arithmetic), and all of the
# coefficients and all relevant history stay in registers, so the outer loop
# has only one load from memory (the residual)

# I have not yet run this through simg4, so there may be some avoidable stalls,
# and there may be a somewhat more clever way to do the outer loop

# the branch mechanism may prevent dynamic loading; I still need to examine
# this issue, and there may be a more elegant method

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 # for quadword-aligned stack data

	slwi r6,r6,2 # adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 # r4 = data+data_len

	mfspr r0,256 # cache old vrsave
	addis r31,0,0xffff
	ori r31,r31,0xfc00
	mtspr 256,r31 # declare VRs in vrsave

	cmplw cr0,r8,r4 # i<data_len
	bc 4,0,L1400

	# load coefficients into v0-v7 and initial history into v8-v15
	li r31,0xf
	and r31,r8,r31 # r31: data%4
	li r11,16
	subf r31,r31,r11 # r31: 4-(data%4)
	slwi r31,r31,3 # convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v18,-1
	vsro v18,v18,v0 # v18: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 # v0: reversal permutation vector

	add r10,r5,r6
	lvsl v17,0,r5 # v17: coefficient alignment permutation vector
	vperm v17,v17,v17,v0 # v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11 # v16: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16
	cmplw cr0,r5,r10
	bc 12,0,L1101
	vand v0,v0,v18
	addis r31,0,L1307@ha
	ori r31,r31,L1307@l
	b L1199

L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,L1306@ha
	ori r31,r31,L1306@l
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	lis r31,L1305@ha
	la r31,L1305@l(r31)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	lis r31,L1304@ha
	la r31,L1304@l(r31)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	lis r31,L1303@ha
	la r31,L1303@l(r31)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	lis r31,L1302@ha
	la r31,L1302@l(r31)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	lis r31,L1301@ha
	la r31,L1301@l(r31)
	b L1199

L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	lis r31,L1300@ha
	la r31,L1300@l(r31)

L1199:
	mtctr r31
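	# ctr now holds the entry point chosen above (one of L1300..L1307,
	# selected from the order), so each pass of the loop below executes
	# only the multiply-accumulate groups the order actually requires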

	# set up invariant vectors
	vspltish v16,0 # v16: zero vector

	li r10,-12
	lvsr v17,r10,r8 # v17: result shift vector
	lvsl v18,r10,r3 # v18: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9 # v19: lp_quantization vector

L1200:
	vmulosh v20,v0,v8 # v20: sum vector
	bcctr 20,0

L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4 # increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16 # v20[3]: sum
	vsraw v20,v20,v19 # v20[3]: sum >> lp_quantization

	lvewx v21,0,r3 # v21[n]: *residual
	vperm v21,v21,v21,v18 # v21[3]: *residual
	vaddsws v20,v21,v20 # v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4 # increment shift vector

	vperm v21,v20,v20,v17 # v21[n]: shift for storage
	vsldoi v17,v17,v17,12 # increment shift vector
	stvewx v21,0,r8

	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4 # insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 # i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0 # restore old vrsave
	lmw r31,-4(r1)
	blr

_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
#	r3: residual[]
#	r4: data_len
#	r5: qlp_coeff[]
#	r6: order
#	r7: lp_quantization
#	r8: data[]

# see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
# this version assumes order<=8; it uses fewer vector registers, which should
# save time in context switches, and has less code, which may improve
# instruction caching

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 # for quadword-aligned stack data

	slwi r6,r6,2 # adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 # r4 = data+data_len

	mfspr r0,256 # cache old vrsave
	addis r31,0,0xffc0
	ori r31,r31,0x0000
	mtspr 256,r31 # declare VRs in vrsave

	cmplw cr0,r8,r4 # i<data_len
	bc 4,0,L2400

	# load coefficients into v0-v1 and initial history into v2-v3
	li r31,0xf
	and r31,r8,r31 # r31: data%4
	li r11,16
	subf r31,r31,r11 # r31: 4-(data%4)
	slwi r31,r31,3 # convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0 # v6: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 # v0: reversal permutation vector

	add r10,r5,r6
	lvsl v5,0,r5 # v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0 # v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11 # v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10
	bc 12,0,L2101
	vand v0,v0,v6
	lis r31,L2301@ha
	la r31,L2301@l(r31)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	lis r31,L2300@ha
	la r31,L2300@l(r31)

L2199:
	mtctr r31
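	# as above, ctr holds the entry point chosen from the order (L2300 or
	# L2301), so the loop below runs only the groups it needs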

	# set up invariant vectors
	vspltish v4,0 # v4: zero vector

	li r10,-12
	lvsr v5,r10,r8 # v5: result shift vector
	lvsl v6,r10,r3 # v6: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9 # v7: lp_quantization vector

L2200:
	vmulosh v8,v0,v2 # v8: sum vector
	bcctr 20,0

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4 # v8[3]: sum
	vsraw v8,v8,v7 # v8[3]: sum >> lp_quantization

	lvewx v9,0,r3 # v9[n]: *residual
	vperm v9,v9,v9,v6 # v9[3]: *residual
	vaddsws v8,v9,v8 # v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4 # increment shift vector

	vperm v9,v8,v8,v5 # v9[n]: shift for storage
	vsldoi v5,v5,v5,12 # increment shift vector
	stvewx v9,0,r8

	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4 # insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 # i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0 # restore old vrsave
	lmw r31,-4(r1)
	blr