/* jsimd_arm_neon.S revision 8c60d22ff51486afacf772b6f6b8b44630ffbff8 */
/*
 * ARM NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* On Linux/ELF, emit an empty .note.GNU-stack section so that the linker
 * does not request an executable stack for objects built from this file. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

/* Assembler setup: code goes to .text, NEON mnemonics are enabled, the
 * instruction set accepted is ARMv7-A, the architecture recorded in the
 * object file attributes is overridden to ARMv4, and ARM (not Thumb)
 * encoding is selected. */
.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


/* When non-zero, output is stored one byte at a time instead of with 32-bit
 * lane stores.  In the code visible in this file it is tested only by
 * jsimd_idct_4x4_neon. */
#define RESPECT_STRICT_ALIGNMENT 1

/*****************************************************************************/

/*
 * asm_function fname
 *
 * Supplementary macro for setting function attributes.  It opens a function
 * with .func, exports its symbol and places the entry label:
 *   - with __APPLE__ defined the symbol is _fname (leading underscore);
 *   - otherwise the symbol is fname, and when __ELF__ is defined it is also
 *     given hidden visibility and typed as a function.
 * Each use is expected to be closed by a matching .endfunc.
 */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/*
 * transpose_4x4 x0, x1, x2, x3
 *
 * Transpose a block of 4x4 coefficients in four 64-bit registers.
 * Each argument holds one row of four 16-bit elements; the transpose is done
 * in place, first on 16-bit element pairs and then on 32-bit pairs.
 */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm

/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c
 *
 * TODO: a bit better instructions scheduling is needed.
 */

/* Lanes of d0 that hold the multipliers once jsimd_idct_ifast_neon_consts
 * has been loaded into d0. */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

/*
 * Multiplier table for vqdmulh.s16, which returns (2 * a * b) >> 16.
 * Only the part of each factor above 1.0 (above 2.0 for the last entry) is
 * stored, scaled by 128 so that the product comes out as a * frac / 256;
 * idct_helper adds the operand itself (or twice the operand) back in.
 */
.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

/*
 * 1-D IDCT helper macro
 *
 *   x0..x7   - the eight input vectors of 16-bit elements; they are
 *              overwritten with the eight results
 *   t10..t14 - scratch registers, clobbered
 *
 * d0 must already contain jsimd_idct_ifast_neon_consts.
 */

.macro idct_helper  x0, x1, x2, x3, x4, x5, x6, x7, \
                    t10, t11, t12, t13, t14

    /* Butterflies on the pairs (x0,x4), (x2,x6), (x3,x5) and (x1,x7):
     * the first register of each pair ends up holding the difference and the
     * second one the sum.  The vswp moves the difference into place and
     * leaves the stale input in the temporary, which is not used again. */
    vsub.s16        \t10, \x0, \x4
    vadd.s16        \x4,  \x0, \x4
    vswp.s16        \t10, \x0
    vsub.s16        \t11, \x2, \x6
    vadd.s16        \x6,  \x2, \x6
    vswp.s16        \t11, \x2
    vsub.s16        \t10, \x3, \x5
    vadd.s16        \x5,  \x3, \x5
    vswp.s16        \t10, \x3
    vsub.s16        \t11, \x1, \x7
    vadd.s16        \x7,  \x1, \x7
    vswp.s16        \t11, \x1

    /* Scaling stage: every vqdmulh yields the fractional part of a product
     * in t13, which is then added to the operand (or to twice the operand
     * for the 2.613 factor, accumulated in t12). */
    vqdmulh.s16     \t13, \x2,  XFIX_1_414213562
    vadd.s16        \t12, \x3,  \x3
    vadd.s16        \x2,  \x2,  \t13
    vqdmulh.s16     \t13, \x3,  XFIX_2_613125930
    vsub.s16        \t10, \x1,  \x3
    vadd.s16        \t12, \t12, \t13
    vqdmulh.s16     \t13, \t10, XFIX_1_847759065
    vsub.s16        \t11, \x7,  \x5
    vadd.s16        \t10, \t10, \t13
    vqdmulh.s16     \t13, \t11, XFIX_1_414213562
    vadd.s16        \t11, \t11, \t13

    /* Combine the scaled terms into the intermediate sums. */
    vqdmulh.s16     \t13, \x1,  XFIX_1_082392200
    vsub.s16        \x2,  \x6,  \x2
    vsub.s16        \t14, \x0,  \x2
    vadd.s16        \x2,  \x0,  \x2
    vadd.s16        \x0,  \x4,  \x6
    vsub.s16        \x4,  \x4,  \x6
    vadd.s16        \x1,  \x1,  \t13
    vadd.s16        \t13, \x7,  \x5
    vsub.s16        \t12, \t13, \t12
    vsub.s16        \t12, \t12, \t10
    vadd.s16        \t11, \t12, \t11
    vsub.s16        \t10, \x1,  \t10
    vadd.s16        \t10, \t10, \t11

    /* Final butterflies: write the eight results back to x0..x7. */
    vsub.s16        \x7,  \x0,  \t13
    vadd.s16        \x0,  \x0,  \t13
    vadd.s16        \x6,  \t14, \t12
    vsub.s16        \x1,  \t14, \t12
    vsub.s16        \x5,  \x2,  \t11
    vadd.s16        \x2,  \x2,  \t11
    vsub.s16        \x3,  \x4,  \t10
    vadd.s16        \x4,  \x4,  \t10
.endm

/*
 * jsimd_idct_ifast_neon
 *
 * Arguments, as named by the register aliases below:
 *   r0 = DCT_TABLE   - 64 16-bit multipliers used for dequantization
 *   r1 = COEF_BLOCK  - 64 16-bit coefficients (8 rows of 8)
 *   r2 = OUTPUT_BUF  - array of 8 row pointers
 *   r3 = OUTPUT_COL  - byte offset added to every row pointer
 * (presumably the usual libjpeg IDCT method arguments - confirm against the
 * C caller).
 *
 * Writes 8 bytes to each of the 8 output rows.  d8-d15 are saved and
 * restored; r0-r2, ip, d0, d4-d7 and d16-d31 are clobbered.  Relies on the
 * idct_helper macro and on jsimd_idct_ifast_neon_consts defined just before
 * this function, and removes that idct_helper definition afterwards.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_idct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | d12     | d13
     *   5 | d14     | d15
     *   6 | d16     | d17
     *   7 | d18     | d19
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK]!
    /* Dequantize: multiply each row by the matching row of DCT_TABLE.
     * q10/q11 are reloaded for the last two rows. */
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q2, q2, q10
    vld1.16         {d24, d25, d26, d27}, [DCT_TABLE]!
    vmul.s16        q3, q3, q11
    vmul.s16        q4, q4, q12
    vld1.16         {d28, d29, d30, d31}, [DCT_TABLE]!
    vmul.s16        q5, q5, q13
    vmul.s16        q6, q6, q14
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q7, q7, q15
    vmul.s16        q8, q8, q10
    vmul.s16        q9, q9, q11

    /* Pass 1 */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose the 8x8 block: four 4x4 transposes, then exchange the two
     * off-diagonal quadrants. */
    transpose_4x4   d4,  d6,  d8,  d10
    transpose_4x4   d5,  d7,  d9,  d11
    transpose_4x4   d12, d14, d16, d18
    transpose_4x4   d13, d15, d17, d19
    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Pass 2 */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose back, same sequence as after pass 1 */
    transpose_4x4   d4,  d6,  d8,  d10
    transpose_4x4   d5,  d7,  d9,  d11
    transpose_4x4   d12, d14, d16, d18
    transpose_4x4   d13, d15, d17, d19
    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Descale and range limit: add 0x80 scaled by the 5 bits that are about
     * to be shifted out (saturating), then shift right by 5 with saturation
     * to unsigned 8 bits. */
    vmov.s16        q15, #(0x80 << 5)
    vqadd.s16       q2, q2, q15
    vqadd.s16       q3, q3, q15
    vqadd.s16       q4, q4, q15
    vqadd.s16       q5, q5, q15
    vqadd.s16       q6, q6, q15
    vqadd.s16       q7, q7, q15
    vqadd.s16       q8, q8, q15
    vqadd.s16       q9, q9, q15
    vqshrun.s16     d4, q2, #5
    vqshrun.s16     d6, q3, #5
    vqshrun.s16     d8, q4, #5
    vqshrun.s16     d10, q5, #5
    vqshrun.s16     d12, q6, #5
    vqshrun.s16     d14, q7, #5
    vqshrun.s16     d16, q8, #5
    vqshrun.s16     d18, q9, #5

    /* Store results to the output buffer: for each row fetch the next row
     * pointer, add the column offset and write 8 bytes. */
    .irp            x, d4, d6, d8, d10, d12, d14, d16, d18
    ldr             TMP, [OUTPUT_BUF], #4
    add             TMP, TMP, OUTPUT_COL
    vst1.8          {\x}, [TMP]!
    .endr

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP
.endfunc

/* The helper name is reused by the reduced-size IDCT functions below. */
.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 *       requires much less arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 *
 * TODO: a bit better instructions scheduling can be achieved by expanding
 *       idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */

/* Number of fractional bits in the FIX_* constants below:
 * FIX(x) is x * 2^CONST_BITS rounded to the nearest integer. */
#define CONST_BITS  13

#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
#define FIX_3_624509785  (29692) /* FIX(3.624509785) */

/*
 * Multipliers for the 4x4 idct_helper, in the order of the d0..d2 lanes
 * they occupy after the table is loaded.  The table is 24 bytes long;
 * jsimd_idct_4x4_neon loads 32 bytes (d0-d3) and ignores d3.
 */
.balign 16
jsimd_idct_4x4_neon_consts:
    .short     FIX_1_847759065     /* d0[0] */
    .short     -FIX_0_765366865    /* d0[1] */
    .short     -FIX_0_211164243    /* d0[2] */
    .short     FIX_1_451774981     /* d0[3] */
    .short     -FIX_2_172734803    /* d1[0] */
    .short     FIX_1_061594337     /* d1[1] */
    .short     -FIX_0_509795579    /* d1[2] */
    .short     -FIX_0_601344887    /* d1[3] */
    .short     FIX_0_899976223     /* d2[0] */
    .short     FIX_2_562915447     /* d2[1] */
    .short     1 << (CONST_BITS+1) /* d2[2] */
    .short     0                   /* d2[3] */

/*
 * idct_helper (4x4 variant)
 *
 * One 1-D pass over four columns held in 64-bit registers.
 *   x4, x6, x8, x10, x12, x14, x16 - the seven input vectors of 16-bit
 *           elements (named after the registers used on the first call)
 *   shift - right shift applied, with rounding, to the 32-bit sums
 *   y26, y27, y28, y29 - 64-bit destinations of the four narrowed results
 *
 * Expects d0-d2 to hold jsimd_idct_4x4_neon_consts.
 * Clobbers q10 and q12-q15.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    /* q14 = x4 * d2[2] + x8 * d0[0] + x14 * d0[1] */
    vmull.s16       q14, \x4,  d2[2]
    vmlal.s16       q14, \x8,  d0[0]
    vmlal.s16       q14, \x14, d0[1]

    /* q13: weighted sum of the four remaining inputs (first weight set) */
    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6,  d2[1]

    /* q15 = x4 * d2[2] - x8 * d0[0] - x14 * d0[1] */
    vmull.s16       q15, \x4,  d2[2]
    vmlsl.s16       q15, \x8,  d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    /* q12: weighted sum of the same four inputs (second weight set) */
    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6,  d1[1]

    /* y26 = (q14 + q13) >> shift, y29 = (q14 - q13) >> shift */
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

    /* A narrowing shift of 32-bit elements accepts at most 16 as the shift
     * amount, so larger shifts are done full width and narrowed afterwards. */
.if \shift > 16
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q14,  q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
.endif

    /* y27 = (q15 + q12) >> shift, y28 = (q15 - q12) >> shift */
    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift > 16
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q15,  q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
.else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
.endif

.endm

/*
 * jsimd_idct_4x4_neon
 *
 * Arguments, as named by the register aliases below:
 *   r0 = DCT_TABLE   - 16-bit dequantization multipliers, 128-bit aligned
 *   r1 = COEF_BLOCK  - 16-bit coefficients, 128-bit aligned
 *   r2 = OUTPUT_BUF  - array of 4 row pointers
 *   r3 = OUTPUT_COL  - byte offset added to every row pointer
 * (presumably the usual libjpeg IDCT method arguments - confirm against the
 * C caller).
 *
 * Writes 4 bytes to each of the 4 output rows.  Row 4 of the input block is
 * skipped.  d8-d15 are saved and restored.  r0, r1 and r2 are reused as
 * TMP1-TMP3 once the inputs they carried are no longer needed.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 4 (8 * 2 bytes) */
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize (the table row matching input row 4 is skipped as well) */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1: columns 0-3, then columns 4-7, each followed by a transpose */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2: results land in d26-d29 (q13, q14) */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit: add 0x80 and saturate to unsigned 8 bits.  Afterwards
     * d26 holds output rows 0 and 1, d27 holds rows 2 and 3. */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer.  The ldmia overwrites its own
     * base register (TMP3 is OUTPUT_BUF), which is no longer needed. */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    /* Byte-by-byte variant: rows 0 and 2 first ... */
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    /* ... then rows 1 and 3 */
    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc

/* The helper name is reused by the 2x2 IDCT below. */
.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
 *       requires much less arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 */

/* Multipliers for the 2x2 idct_helper, in the order of the d0 lanes they
 * occupy after loading.  The FIX_* values are the ones #defined for the
 * 4x4 IDCT earlier in this file. */
.balign 8
jsimd_idct_2x2_neon_consts:
    .short     -FIX_0_720959822    /* d0[0] */
    .short     FIX_0_850430095     /* d0[1] */
    .short     -FIX_1_272758580    /* d0[2] */
    .short     FIX_3_624509785     /* d0[3] */

/*
 * idct_helper (2x2 variant)
 *
 *   x4            - vector that is widened and shifted left by 15 bits
 *   x6, x10, x12, x16 - vectors multiplied by d0[3], d0[2], d0[1], d0[0]
 *   shift         - right shift applied, with rounding, to the 32-bit sums
 *   y26, y27      - 64-bit destinations: y26 = (a + b) >> shift and
 *                   y27 = (a - b) >> shift, where a is the widened x4 and
 *                   b the weighted sum
 *
 * Expects d0 to hold jsimd_idct_2x2_neon_consts.
 * Clobbers q10, q13 and q14.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16  q14,  \x4,  #15
    vmull.s16  q13,  \x6,  d0[3]
    vmlal.s16  q13,  \x10, d0[2]
    vmlal.s16  q13,  \x12, d0[1]
    vmlal.s16  q13,  \x16, d0[0]

    vadd.s32   q10,  q14,  q13
    vsub.s32   q14,  q14,  q13

    /* A narrowing shift of 32-bit elements accepts at most 16 as the shift
     * amount, so larger shifts are done full width and narrowed afterwards. */
.if \shift > 16
    vrshr.s32  q10,  q10,  #\shift
    vrshr.s32  q14,  q14,  #\shift
    vmovn.s32  \y26, q10
    vmovn.s32  \y27, q14
.else
    vrshrn.s32 \y26, q10,  #\shift
    vrshrn.s32 \y27, q14,  #\shift
.endif

.endm

/*
 * jsimd_idct_2x2_neon
 *
 * Arguments, as named by the register aliases below:
 *   r0 = DCT_TABLE   - 16-bit dequantization multipliers, 128-bit aligned
 *   r1 = COEF_BLOCK  - 16-bit coefficients, 128-bit aligned
 *   r2 = OUTPUT_BUF  - array of 2 row pointers
 *   r3 = OUTPUT_COL  - byte offset added to every row pointer
 * (presumably the usual libjpeg IDCT method arguments - confirm against the
 * C caller).
 *
 * Writes 2 bytes to each of the 2 output rows.  Only input rows 0, 1, 3, 5
 * and 7 are read.  d8-d15 are saved and restored.  r0 is reused as TMP1
 * once the dequantization table has been consumed.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | -       | -
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | -       | -
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* skip row 2 */
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* skip row 4 */
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* skip row 6 */
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize, skipping the same rows of the table */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1.  The disabled branch is the macro-based form kept for
     * reference; the enabled branch is the same computation written out by
     * hand for both halves, followed by the element shuffles. */
#if 0
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8,  d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9,  d11
#else
    vmull.s16       q13, d6,  d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7,  d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4,  #15
    vshll.s16       q15, d5,  #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4,  q10, #13
    vrshrn.s32      d6,  q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5,  q10, #13
    vrshrn.s32      d7,  q14, #13
    vtrn.16         q2,  q3
    vtrn.32         q3,  q5
#endif

    /* Pass 2: results land in d26 and d27 (q13) */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit: add 0x80 and saturate to unsigned 8 bits.  The stores
     * below read lanes 0-1 of d26 and lanes 4-5 of d27. */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */


/*
 * do_load size
 *
 * Load 'size' (8, 4, 2 or 1) bytes from each of the U, V and Y pointers into
 * d4, d5 and d0 respectively, advancing the pointers.  The partial sizes
 * fill disjoint lanes (4 -> lanes 0-3, 2 -> lanes 4-5, 1 -> lane 6), so a
 * row tail of up to 7 pixels can be collected by combining them.
 * Only the 8-pixel form issues prefetches.
 *
 * U, V and Y must be register aliases that are in effect where the macro is
 * expanded.
 */
.macro do_load size
    .if \size == 8
        vld1.8  {d4}, [U]!
        vld1.8  {d5}, [V]!
        vld1.8  {d0}, [Y]!
        pld     [Y, #64]
        pld     [U, #64]
        pld     [V, #64]
    .elseif \size == 4
        vld1.8  {d4[0]}, [U]!
        vld1.8  {d4[1]}, [U]!
        vld1.8  {d4[2]}, [U]!
        vld1.8  {d4[3]}, [U]!
        vld1.8  {d5[0]}, [V]!
        vld1.8  {d5[1]}, [V]!
        vld1.8  {d5[2]}, [V]!
        vld1.8  {d5[3]}, [V]!
        vld1.8  {d0[0]}, [Y]!
        vld1.8  {d0[1]}, [Y]!
        vld1.8  {d0[2]}, [Y]!
        vld1.8  {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8  {d4[4]}, [U]!
        vld1.8  {d4[5]}, [U]!
        vld1.8  {d5[4]}, [V]!
        vld1.8  {d5[5]}, [V]!
        vld1.8  {d0[4]}, [Y]!
        vld1.8  {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8  {d4[6]}, [U]!
        vld1.8  {d5[6]}, [V]!
        vld1.8  {d0[6]}, [Y]!
    .else
        .error "unsupported macroblock size"
    .endif
.endm

/*
 * do_store bpp, size
 *
 * Store 'size' (8, 4, 2 or 1) interleaved pixels to the RGB pointer,
 * advancing it.  With bpp == 24 each pixel is taken from d10, d11, d12;
 * with bpp == 32 from d10, d11, d12, d13.  The partial sizes read the same
 * lanes that do_load fills (4 -> lanes 0-3, 2 -> lanes 4-5, 1 -> lane 6).
 *
 * RGB must be a register alias that is in effect where the macro is
 * expanded.
 */
.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8  {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error "unsupported macroblock size"
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error "unsupported macroblock size"
        .endif
    .else
        .error "unsupported bpp"
    .endif
.endm

/*
 * generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 *
 * Emits one conversion function named jsimd_ycc_<colorid>_convert_neon
 * together with its own copy of the constant table.
 *   colorid - name fragment of the generated function and table
 *   bpp     - 24 or 32, passed on to do_store
 *   r_offs, g_offs, b_offs - single digits appended to "d1" to pick the
 *             destination register (d10..d13) of each colour channel
 *
 * Uses the do_load and do_store macros.  Defines do_yuv_to_rgb for the
 * duration of the expansion and purges it again at the end.
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * Convert the pixels loaded by do_load (d4 = U, d5 = V, d0 = Y).
 * Expects d1 to hold the four multipliers and q1 (d2, d3) to hold -128 in
 * every 16-bit lane.  Clobbers q3, q4 and q10-q15.
 */
.macro do_yuv_to_rgb
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
    /* Scale back with rounding: the sum used for green by 15 bits, the
     * products used for red and blue by 14 bits. */
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    /* Add Y to every channel, then saturate to unsigned 8 bits */
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1\g_offs, q10
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short          0,      0,     0,      0
    .short          22971, -11277, -23401, 29033
    .short          -128,  -128,   -128,   -128
    .short          -128,  -128,   -128,   -128

/*
 * Arguments, as named by the register aliases below:
 *   r0      = OUTPUT_WIDTH - number of pixels per row
 *   r1      = INPUT_BUF    - array of three plane pointers (Y, U, V)
 *   r2      = INPUT_ROW    - index of the first input row
 *   r3      = OUTPUT_BUF   - array of output row pointers
 *   [stack] = NUM_ROWS     - number of rows to convert
 * r4-r10, lr and d8-d15 are saved and restored.
 */
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments.  Eight registers are
     * pushed, so the fifth argument is found 4 * 8 bytes above sp. */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels, 8 at a time.  N is kept 8 below the number of
     * pixels left, so its low three bits give the size of the row tail. */
    subs            N, N, #8
    blt             2f
1:
    do_load         8
    do_yuv_to_rgb
    do_store        \bpp, 8
    subs            N, N, #8
    bge             1b
    tst             N, #7
    beq             8f
2:
    /* Row tail: gather 4, 2 and 1 pixels as needed, convert once, then
     * store the same pieces. */
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

.purgem do_yuv_to_rgb

.endm

/* Instantiate the six converters.  The R, G and B columns are the digits
 * appended to "d1" to select each channel's output register, i.e. the
 * position of that channel within a stored pixel. */
/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

/* The load/store helpers are no longer needed once all functions exist. */
.purgem do_load
.purgem do_store

/*****************************************************************************/
866