1/*
2 * ARM NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
7 *
8 * This software is provided 'as-is', without any express or implied
9 * warranty.  In no event will the authors be held liable for any damages
10 * arising from the use of this software.
11 *
12 * Permission is granted to anyone to use this software for any purpose,
13 * including commercial applications, and to alter it and redistribute it
14 * freely, subject to the following restrictions:
15 *
16 * 1. The origin of this software must not be misrepresented; you must not
17 *    claim that you wrote the original software. If you use this software
18 *    in a product, an acknowledgment in the product documentation would be
19 *    appreciated but is not required.
20 * 2. Altered source versions must be plainly marked as such, and must not be
21 *    misrepresented as being the original software.
22 * 3. This notice may not be removed or altered from any source distribution.
23 */
24/* Copyright (c) 2011,  NVIDIA CORPORATION. All rights reserved.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 *
30 *  * Redistributions of source code must retain the above copyright
31 *    notice, this list of conditions and the following disclaimer.
32 *  * Redistributions in binary form must reproduce the above copyright
33 *    notice, this list of conditions and the following disclaimer in the
34 *    documentation and/or other materials provided with the distribution.
35 *  * Neither the name of the NVIDIA CORPORATION nor the names of its
36 *    contributors may be used to endorse or promote products derived
37 *    from this software without specific prior written permission.
38 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
39 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
42 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
43 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
44 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
45 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
46 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
47 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
48 * THE POSSIBILITY OF SUCH DAMAGE.
49 */
50
51
52
#if defined(__linux__) && defined(__ELF__)
/* An empty .note.GNU-stack section marks the stack as non-executable. */
.section .note.GNU-stack,"",%progbits
#endif

/* Everything below is ARM-mode code for ARMv7-A with the NEON extension. */
.text
.fpu neon
.arch armv7a
.object_arch armv7a
.arm


/*
 * RESPECT_STRICT_ALIGNMENT
 *
 * When non-zero, jsimd_idct_4x4_neon writes its output one byte at a time.
 * When zero (and __ARMEL__ is defined) it uses 32-bit lane stores to the
 * output rows instead, which needs fewer instructions but is only safe if
 * unaligned accesses are not trapped.
 *
 * The default is 1. The guard lets a build override it, e.g. with
 * -DRESPECT_STRICT_ALIGNMENT=0, without editing this file.
 */
#ifndef RESPECT_STRICT_ALIGNMENT
#define RESPECT_STRICT_ALIGNMENT 1
#endif
65
66/*****************************************************************************/
67
/*
 * asm_function fname
 *
 * Opens the definition of the function \fname: exports the symbol, tags it
 * as a hidden function symbol on ELF targets, starts a .func region and
 * finally emits the entry label. Every use in this file is closed by a
 * matching .endfunc after the function body.
 */
.macro asm_function fname
    .global \fname
#ifdef __ELF__
    .type \fname, %function
    .hidden \fname
#endif
    .func \fname
\fname:
.endm
78
/*
 * transpose_4x4 x0, x1, x2, x3
 *
 * In-place transpose of a 4x4 block of 16-bit coefficients held one row per
 * register (four 64-bit registers). The two instructions of each stage touch
 * disjoint registers, so their relative order does not matter.
 */
.macro transpose_4x4 x0, x1, x2, x3
    /* Stage 1: exchange 16-bit elements between rows 0/1 and rows 2/3 */
    vtrn.16 \x2, \x3
    vtrn.16 \x0, \x1
    /* Stage 2: exchange 32-bit pairs between rows 0/2 and rows 1/3 */
    vtrn.32 \x1, \x3
    vtrn.32 \x0, \x2
.endm
86
87/*****************************************************************************/
88
/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * TODO: somewhat better instruction scheduling is needed.
 */
99
/*
 * Names for the lanes of d0 once jsimd_idct_ifast_neon_consts has been
 * loaded into it.
 */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

/*
 * Multipliers for the ifast IDCT. Each one is given in 1/256 units
 * (277/256 ~ 1.0824, 362/256 ~ 1.4142, 473/256 ~ 1.8478, 669/256 ~ 2.6131),
 * has its integer part (256 or 512) removed so that it fits a signed 16-bit
 * lane, and is scaled by 128. The 1-D helper macro multiplies with vqdmulh
 * and adds the integer multiple of the operand back separately.
 */
.balign 16
jsimd_idct_ifast_neon_consts:
    .short ((277 - 256) * 128) /* XFIX_1_082392200 */
    .short ((362 - 256) * 128) /* XFIX_1_414213562 */
    .short ((473 - 256) * 128) /* XFIX_1_847759065 */
    .short ((669 - 512) * 128) /* XFIX_2_613125930 */
111
/*
 * idct_helper x0..x7, t10..t14
 *
 * One 1-D pass of the ifast inverse DCT over eight rows of signed 16-bit
 * coefficients.
 *
 *   x0..x7   - the eight rows; overwritten in place with the result
 *   t10..t14 - scratch registers, clobbered
 *
 * Expects the table jsimd_idct_ifast_neon_consts in d0 (the XFIX_* lanes).
 * Every multiplication is a vqdmulh by the fractional part of the constant
 * followed by an addition of the operand itself (or twice the operand for
 * XFIX_2_613125930).
 */

.macro idct_helper  x0, x1, x2, x3, x4, x5, x6, x7, \
                    t10, t11, t12, t13, t14

    /*
     * Butterflies. Each vsub/vadd/vswp triple leaves the difference in the
     * first register of the pair and the sum in the second one:
     *   x0 = x0 - x4, x4 = x0 + x4
     *   x2 = x2 - x6, x6 = x2 + x6
     *   x3 = x3 - x5, x5 = x3 + x5
     *   x1 = x1 - x7, x7 = x1 + x7
     */
    vsub.s16        \t10, \x0, \x4
    vadd.s16        \x4,  \x0, \x4
    vswp.s16        \t10, \x0
    vsub.s16        \t11, \x2, \x6
    vadd.s16        \x6,  \x2, \x6
    vswp.s16        \t11, \x2
    vsub.s16        \t10, \x3, \x5
    vadd.s16        \x5,  \x3, \x5
    vswp.s16        \t10, \x3
    vsub.s16        \t11, \x1, \x7
    vadd.s16        \x7,  \x1, \x7
    vswp.s16        \t11, \x1

    /*
     * Constant multiplications:
     *   x2  = x2 * 1.414213562
     *   t12 = x3 * 2.613125930
     *   t10 = (x1 - x3) * 1.847759065
     *   t11 = (x7 - x5) * 1.414213562
     */
    vqdmulh.s16     \t13, \x2,  XFIX_1_414213562
    vadd.s16        \t12, \x3,  \x3
    vadd.s16        \x2,  \x2,  \t13
    vqdmulh.s16     \t13, \x3,  XFIX_2_613125930
    vsub.s16        \t10,  \x1, \x3
    vadd.s16        \t12, \t12, \t13
    vqdmulh.s16     \t13, \t10, XFIX_1_847759065
    vsub.s16        \t11, \x7,  \x5
    vadd.s16        \t10, \t10, \t13
    vqdmulh.s16     \t13, \t11, XFIX_1_414213562
    vadd.s16        \t11, \t11, \t13

    /* x1 = x1 * 1.082392200 (completed below), then combine the terms */
    vqdmulh.s16     \t13, \x1,  XFIX_1_082392200
    vsub.s16        \x2,  \x6,  \x2
    vsub.s16        \t14, \x0,  \x2
    vadd.s16        \x2,  \x0,  \x2
    vadd.s16        \x0,  \x4,  \x6
    vsub.s16        \x4,  \x4,  \x6
    vadd.s16        \x1,  \x1,  \t13
    vadd.s16        \t13, \x7,  \x5
    vsub.s16        \t12, \t13, \t12
    vsub.s16        \t12, \t12, \t10
    vadd.s16        \t11, \t12, \t11
    vsub.s16        \t10, \x1,  \t10
    vadd.s16        \t10, \t10, \t11

    /* Final sums and differences, written back over x0..x7 */
    vsub.s16        \x7,  \x0,  \t13
    vadd.s16        \x0,  \x0,  \t13
    vadd.s16        \x6,  \t14, \t12
    vsub.s16        \x1,  \t14, \t12
    vsub.s16        \x5,  \x2,  \t11
    vadd.s16        \x2,  \x2,  \t11
    vsub.s16        \x3,  \x4,  \t10
    vadd.s16        \x4,  \x4,  \t10
.endm
165
/*
 * jsimd_idct_ifast_neon
 *
 * Register arguments:
 *   r0 (DCT_TABLE)  - 64 16-bit multipliers, read as four 32-byte loads
 *   r1 (COEF_BLOCK) - 64 16-bit coefficients, read as four 32-byte loads
 *   r2 (OUTPUT_BUF) - array of eight 32-bit row pointers
 *   r3 (OUTPUT_COL) - offset added to every row pointer
 *
 * Eight bytes are stored to each of the eight output rows.
 * r0, r1 and r2 are advanced by the post-incrementing accesses and ip is
 * used as scratch; d8-d15 are saved on entry and restored before returning.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP             .req ip

    /* q4-q7 (d8-d15) are used below, so keep the caller's copies */
    vpush           {d8-d15}

    /* d0 <- the four multipliers used by idct_helper */
    adr             TMP, jsimd_idct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /*
     * Register allocation for the coefficient block (row: low half | high
     * half, i.e. columns 0-3 | columns 4-7):
     *
     *   row 0: d4  | d5       row 4: d12 | d13
     *   row 1: d6  | d7       row 5: d14 | d15
     *   row 2: d8  | d9       row 6: d16 | d17
     *   row 3: d10 | d11      row 7: d18 | d19
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK]!

    /*
     * Dequantize: multiply every row by the matching row of DCT_TABLE.
     * The table loads are interleaved with the multiplies; q10/q11 are
     * reloaded for the last two rows.
     */
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q2, q2, q10
    vld1.16         {d24, d25, d26, d27}, [DCT_TABLE]!
    vmul.s16        q3, q3, q11
    vmul.s16        q4, q4, q12
    vld1.16         {d28, d29, d30, d31}, [DCT_TABLE]!
    vmul.s16        q5, q5, q13
    vmul.s16        q6, q6, q14
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q7, q7, q15
    vmul.s16        q8, q8, q10
    vmul.s16        q9, q9, q11

    /* Pass 1: 1-D IDCT across the eight rows */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14

    /*
     * 8x8 transpose. transpose_4x4 is a purely textual macro, so handing it
     * q registers transposes the 4x4 sub-blocks in both halves of each
     * register at once; the vswp group then exchanges the two off-diagonal
     * 4x4 quadrants.
     */
    transpose_4x4   q2, q3, q4, q5
    transpose_4x4   q6, q7, q8, q9

    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Pass 2: the same 1-D IDCT on the transposed block */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14

    /* Transpose back */
    transpose_4x4   q2, q3, q4, q5
    transpose_4x4   q6, q7, q8, q9

    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /*
     * Descale and range limit: add 0x80 (pre-scaled by the 5 bits that are
     * about to be shifted out) with saturation, then shift right by 5 and
     * narrow to unsigned bytes with saturation.
     */
    vmov.s16        q15, #(0x80 << 5)
    .irp            row, q2, q3, q4, q5, q6, q7, q8, q9
    vqadd.s16       \row, \row, q15
    .endr
    vqshrun.s16     d4, q2, #5
    vqshrun.s16     d6, q3, #5
    vqshrun.s16     d8, q4, #5
    vqshrun.s16     d10, q5, #5
    vqshrun.s16     d12, q6, #5
    vqshrun.s16     d14, q7, #5
    vqshrun.s16     d16, q8, #5
    vqshrun.s16     d18, q9, #5

    /*
     * Store: fetch the next row pointer from OUTPUT_BUF, add OUTPUT_COL and
     * write the eight result bytes of that row.
     */
    .irp            pixels, d4, d6, d8, d10, d12, d14, d16, d18
    ldr             TMP, [OUTPUT_BUF], #4
    add             TMP, TMP, OUTPUT_COL
    vst1.8          {\pixels}, [TMP]!
    .endr

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP
.endfunc

/* The name idct_helper is reused for a different macro further down */
.purgem idct_helper
284
285/*****************************************************************************/
286
/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: somewhat better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */
304
/* Number of fractional bits in the FIX_* fixed-point constants below */
#define CONST_BITS  13

/* FIX(x) = x * 2^CONST_BITS, rounded to the nearest integer */
#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
#define FIX_3_624509785  (29692) /* FIX(3.624509785) */

/*
 * Constants for jsimd_idct_4x4_neon, which loads them with a single 32-byte
 * vld1.16 {d0, d1, d2, d3}. The d3 row is unused padding: it makes the table
 * as large as that load, so the load never reads past the end of the table
 * into the instructions that follow it.
 */
.balign 16
jsimd_idct_4x4_neon_consts:
    .short     FIX_1_847759065     /* d0[0] */
    .short     -FIX_0_765366865    /* d0[1] */
    .short     -FIX_0_211164243    /* d0[2] */
    .short     FIX_1_451774981     /* d0[3] */
    .short     -FIX_2_172734803    /* d1[0] */
    .short     FIX_1_061594337     /* d1[1] */
    .short     -FIX_0_509795579    /* d1[2] */
    .short     -FIX_0_601344887    /* d1[3] */
    .short     FIX_0_899976223     /* d2[0] */
    .short     FIX_2_562915447     /* d2[1] */
    .short     1 << (CONST_BITS+1) /* d2[2] */
    .short     0                   /* d2[3] */
    .short     0, 0, 0, 0          /* d3: padding only */
336
/*
 * idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
 *
 * One 1-D pass of the reduced 4x4 inverse DCT on four columns at a time.
 *
 *   x4, x8, x14       - inputs of the even part (rows 0, 2 and 6 in pass 1)
 *   x6, x10, x12, x16 - inputs of the odd part (rows 1, 3, 5 and 7 in pass 1)
 *   shift             - rounding right shift applied to the 32-bit results
 *   y26, y27, y28, y29 - 64-bit destinations for output rows 0, 1, 2 and 3
 *
 * Expects jsimd_idct_4x4_neon_consts in d0-d2. Clobbers q10 and q12-q15.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    /* q14 = x4 * 2^(CONST_BITS+1) + x8 * 1.847759065 - x14 * 0.765366865 */
    vmull.s16       q14, \x4,  d2[2]
    vmlal.s16       q14, \x8,  d0[0]
    vmlal.s16       q14, \x14, d0[1]

    /* q13 = odd-part sum paired with q14 (outputs 0 and 3) */
    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6,  d2[1]

    /* q15 = x4 * 2^(CONST_BITS+1) - x8 * 1.847759065 + x14 * 0.765366865 */
    vmull.s16       q15, \x4,  d2[2]
    vmlsl.s16       q15, \x8,  d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    /* q12 = odd-part sum paired with q15 (outputs 1 and 2) */
    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6,  d1[1]

    /* Outputs 0 and 3 */
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

    /*
     * vrshrn can shift a 32-bit source by at most 16 bits, so larger shifts
     * are done as a full-width rounding shift followed by a plain narrow.
     */
.if \shift <= 16
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
.else
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q14,  q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
.endif

    /* Outputs 1 and 2 */
    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift <= 16
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
.else
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q15,  q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
.endif

.endm
383
/*
 * jsimd_idct_4x4_neon
 *
 * Register arguments:
 *   r0 (DCT_TABLE)  - 64 16-bit multipliers; row 4 is skipped
 *   r1 (COEF_BLOCK) - 64 16-bit coefficients; row 4 is skipped
 *   r2 (OUTPUT_BUF) - array of four 32-bit row pointers
 *   r3 (OUTPUT_COL) - offset added to every row pointer
 *
 * Four bytes are stored to each of the four output rows.
 * r0, r1, r2 and ip are reused as row pointers (TMP1-TMP4) once the inputs
 * have been consumed; d8-d15 are saved on entry and restored on return.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8-d15}

    /* d0-d2 <- constants; d3 is loaded as well but never used */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /*
     * Register allocation for the coefficient block (row: columns 0-3 |
     * columns 4-7). Row 4 is not needed and is never loaded:
     *
     *   row 0: d4  | d5       row 5: d12 | d13
     *   row 1: d6  | d7       row 6: d14 | d15
     *   row 2: d8  | d9       row 7: d16 | d17
     *   row 3: d10 | d11
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
    add COEF_BLOCK, COEF_BLOCK, #16     /* step over row 4 */
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16         {d16, d17}, [COEF_BLOCK]!

    /* Dequantize, interleaving the table loads with the multiplies */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16   /* step over row 4 */
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15


    /* Pass 1: columns 0-3 first, then columns 4-7; results land in rows 0-3 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2: d4/d6/d8/d10 and d7/d9/d11 are the inputs, q13/q14 the result */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit: add 0x80, then narrow to unsigned bytes with saturation */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Fetch the four row pointers and apply the column offset to each */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    .irp            row_ptr, TMP1, TMP2, TMP3, TMP4
    add             \row_ptr, \row_ptr, OUTPUT_COL
    .endr

    /*
     * Result layout: d26 bytes 0-3 -> row 0 (TMP1), d26 bytes 4-7 -> row 1
     * (TMP2), d27 bytes 0-3 -> row 2 (TMP3), d27 bytes 4-7 -> row 3 (TMP4).
     */
#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    /* Byte-wise stores: rows 0 and 2 first ... */
    .irp            lane, 0, 1, 2, 3
    vst1.8          {d26[\lane]}, [TMP1]!
    vst1.8          {d27[\lane]}, [TMP3]!
    .endr

    /* ... then rows 1 and 3 */
    .irp            lane, 4, 5, 6, 7
    vst1.8          {d26[\lane]}, [TMP2]!
    vst1.8          {d27[\lane]}, [TMP4]!
    .endr
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc

/* The name idct_helper is reused for a different macro further down */
.purgem idct_helper
499
500/*****************************************************************************/
501
/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */
515
/*
 * Multipliers for jsimd_idct_2x2_neon, loaded into d0 as lanes 0..3.
 * They are applied to rows 7, 5, 3 and 1 respectively.
 */
.balign 8
jsimd_idct_2x2_neon_consts:
    /*         d0[0]             d0[1] */
    .short     -FIX_0_720959822, FIX_0_850430095
    /*         d0[2]             d0[3] */
    .short     -FIX_1_272758580, FIX_3_624509785
522
/*
 * idct_helper x4, x6, x10, x12, x16, shift, y26, y27
 *
 * One 1-D pass of the reduced 2x2 inverse DCT.
 *
 *   x4                - even-part input, scaled by 2^15
 *   x6, x10, x12, x16 - odd-part inputs, weighted by d0[3], d0[2], d0[1]
 *                       and d0[0]
 *   shift             - rounding right shift applied to the 32-bit results
 *   y26, y27          - 64-bit destinations for (even + odd) and (even - odd)
 *
 * Expects jsimd_idct_2x2_neon_consts in d0. Clobbers q10, q13 and q14.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    /* Even part */
    vshll.s16  q14,  \x4,  #15
    /* Odd part */
    vmull.s16  q13,  \x6,  d0[3]
    vmlal.s16  q13,  \x10, d0[2]
    vmlal.s16  q13,  \x12, d0[1]
    vmlal.s16  q13,  \x16, d0[0]

    vadd.s32   q10,  q14,  q13
    vsub.s32   q14,  q14,  q13

    /*
     * vrshrn can shift a 32-bit source by at most 16 bits, so larger shifts
     * are done as a full-width rounding shift followed by a plain narrow.
     */
.if \shift <= 16
    vrshrn.s32 \y26, q10,  #\shift
    vrshrn.s32 \y27, q14,  #\shift
.else
    vrshr.s32  q10,  q10,  #\shift
    vrshr.s32  q14,  q14,  #\shift
    vmovn.s32  \y26, q10
    vmovn.s32  \y27, q14
.endif

.endm
544
/*
 * jsimd_idct_2x2_neon
 *
 * Register arguments:
 *   r0 (DCT_TABLE)  - 16-bit multipliers; only rows 0, 1, 3, 5, 7 are read
 *   r1 (COEF_BLOCK) - 16-bit coefficients; only rows 0, 1, 3, 5, 7 are read
 *   r2 (OUTPUT_BUF) - array of two 32-bit row pointers
 *   r3 (OUTPUT_COL) - offset added to every row pointer
 *
 * Two bytes are stored to each of the two output rows.
 * r0 and ip are reused as row pointers (TMP1, TMP2) once the inputs have
 * been consumed; d8-d15 are saved on entry and restored on return.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* d0 <- the four odd-part multipliers */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /*
     * Register allocation for the coefficient block (row: columns 0-3 |
     * columns 4-7). Rows 2, 4 and 6 are not needed and are never loaded:
     *
     *   row 0: d4  | d5       row 5: d12 | d13
     *   row 1: d6  | d7       row 7: d16 | d17
     *   row 3: d10 | d11
     */

    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* step over row 2 */
    vld1.16         {d10, d11}, [COEF_BLOCK]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* step over row 4 */
    vld1.16         {d12, d13}, [COEF_BLOCK]!
    add             COEF_BLOCK, COEF_BLOCK, #16     /* step over row 6 */
    vld1.16         {d16, d17}, [COEF_BLOCK]!

    /* Dequantize, skipping the same rows of the multiplier table */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE]!
    vmul.s16        q8, q8, q15

    /*
     * Pass 1, written out for all eight columns at once.
     * Even part: row 0 scaled by 2^15 (q14: columns 0-3, q15: columns 4-7).
     */
    vshll.s16       q14, d4,  #15
    vshll.s16       q15, d5,  #15
    /* Odd part: weighted sum of rows 1, 3, 5 and 7 (q13 / q12) */
    vmull.s16       q13, d6,  d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7,  d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    /* q2 = (even + odd) >> 13, q3 = (even - odd) >> 13, with rounding */
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4,  q10, #13
    vrshrn.s32      d6,  q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5,  q10, #13
    vrshrn.s32      d7,  q14, #13
    /* Rearrange so that pass 2 finds its inputs in d4, d6, d10, d7, d11 */
    vtrn.16         q2,  q3
    vtrn.32         q3,  q5

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /*
     * Range limit: add 0x80, then narrow to unsigned bytes with saturation.
     * The first vqmovun overwrites d26, the low half of q13, but the bytes
     * stored from d27 below (4 and 5) are narrowed from the high half of
     * q13, which is still intact when the second vqmovun reads it.
     */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Fetch the two row pointers and apply the column offset to each */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    /* Row 0 gets bytes d26[0], d27[4]; row 1 gets bytes d26[1], d27[5] */
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
.endfunc

/* idct_helper is not needed past this point */
.purgem idct_helper
647
648/*****************************************************************************/
649
/*
 * jsimd_ycc_rgba8888_convert_neon
 * jsimd_ycc_rgb565_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB, producing 32 bits per pixel (rgba8888)
 * or 16 bits per pixel (rgb565).
 */
655
656
/*
 * do_load size
 *
 * Loads \size pixels' worth of input: U bytes into d4, V bytes into d5 and
 * Y bytes into d0, post-incrementing the U, V and Y pointer aliases (these
 * must be defined with .req by the code that uses the macro).
 *
 *   size == 8 : whole registers, followed by prefetches 64 bytes ahead
 *   size == 4 : lanes 0-3
 *   size == 2 : lanes 4-5
 *   size == 1 : lane 6
 *
 * The partial sizes fill disjoint lanes, so they can be combined to gather
 * anything from 1 to 7 pixels into the same registers.
 */
.macro do_load size
    .if \size == 8
        vld1.8  {d4}, [U]!
        vld1.8  {d5}, [V]!
        vld1.8  {d0}, [Y]!
        pld     [Y, #64]
        pld     [U, #64]
        pld     [V, #64]
    .elseif \size == 4
        .irp lane, 0, 1, 2, 3
        vld1.8  {d4[\lane]}, [U]!
        .endr
        .irp lane, 0, 1, 2, 3
        vld1.8  {d5[\lane]}, [V]!
        .endr
        .irp lane, 0, 1, 2, 3
        vld1.8  {d0[\lane]}, [Y]!
        .endr
    .elseif \size == 2
        .irp lane, 4, 5
        vld1.8  {d4[\lane]}, [U]!
        .endr
        .irp lane, 4, 5
        vld1.8  {d5[\lane]}, [V]!
        .endr
        .irp lane, 4, 5
        vld1.8  {d0[\lane]}, [Y]!
        .endr
    .elseif \size == 1
        vld1.8  {d4[6]}, [U]!
        vld1.8  {d5[6]}, [V]!
        vld1.8  {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm
693
694
695
696
697
/*
 * do_store bpp, size
 *
 * Stores \size converted pixels through the RGB pointer alias (defined with
 * .req by the code that uses the macro), post-incrementing it. The channel
 * bytes are taken from d10, d11, d12 (and d13 for 32 bpp).
 *
 *   bpp == 16 : d10/d11/d12 are first packed into RGB565 as two byte
 *               planes, d26 = GGGBBBBB (stored first) and d27 = RRRRRGGG,
 *               treating d10 as red, d11 as green and d12 as blue
 *   bpp == 24 : d10, d11, d12 interleaved
 *   bpp == 32 : d10, d11, d12, d13 interleaved
 *
 * size selects the lanes exactly as in do_load: 8 = all, 4 = lanes 0-3,
 * 2 = lanes 4-5, 1 = lane 6.
 */
.macro do_store bpp, size
    .if \bpp == 16
            /* d27 = top five bits of d10 | top three bits of d11 */
            vmov      d27, d10
            vsri.u8   d27, d11, #5
            /* d26 = bits 4..2 of d11 | top five bits of d12 */
            vsli.u8   d26, d11, #3
            vsri.u8   d26, d12, #3

        .if \size == 8
            vst2.8  {d26, d27}, [RGB]!
        .elseif \size == 4
            .irp lane, 0, 1, 2, 3
            vst2.8  {d26[\lane], d27[\lane]}, [RGB]!
            .endr
        .elseif \size == 2
            .irp lane, 4, 5
            vst2.8  {d26[\lane], d27[\lane]}, [RGB]!
            .endr
        .elseif \size == 1
            vst2.8  {d26[6], d27[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 24
        .if \size == 8
            vst3.8  {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            .irp lane, 0, 1, 2, 3
            vst3.8  {d10[\lane], d11[\lane], d12[\lane]}, [RGB]!
            .endr
        .elseif \size == 2
            .irp lane, 4, 5
            vst3.8  {d10[\lane], d11[\lane], d12[\lane]}, [RGB]!
            .endr
        .elseif \size == 1
            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            .irp lane, 0, 1, 2, 3
            vst4.8  {d10[\lane], d11[\lane], d12[\lane], d13[\lane]}, [RGB]!
            .endr
        .elseif \size == 2
            .irp lane, 4, 5
            vst4.8  {d10[\lane], d11[\lane], d12[\lane], d13[\lane]}, [RGB]!
            .endr
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm
757
/*
 * generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 *
 * Emits a private constant table and the function
 * jsimd_ycc_<colorid>_convert_neon.
 *
 *   colorid - suffix used in the generated symbol names
 *   bpp     - bits per output pixel, passed on to do_store
 *   r_offs, g_offs, b_offs - single digits n; the red, green and blue
 *             results are written to d1<n> (i.e. d10 + n)
 *
 * Generated function:
 *   r0 (OUTPUT_WIDTH) - number of pixels per row
 *   r1 (INPUT_BUF)    - array of three pointers, one per input plane, each
 *                       pointing to an array of row pointers (Y, U, V)
 *   r2 (INPUT_ROW)    - index of the first input row
 *   r3 (OUTPUT_BUF)   - array of output row pointers
 *   [sp] (NUM_ROWS)   - number of rows to convert; nothing is done if < 1
 *
 * r4-r10, lr and d8-d15 are saved on entry and restored on return.
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * do_yuv_to_rgb: converts the pixels held in d0 (Y), d4 (U) and d5 (V).
 * Expects d1 = {22971, -11277, -23401, 29033} and q1 = eight copies of -128.
 * Clobbers q3, q4, q10-q15.
 */
.macro do_yuv_to_rgb
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    /* green: (-11277 * (u - 128) - 23401 * (v - 128)) >> 15 */
    vmull.s16       q10, d6, d1[1]
    vmlal.s16       q10, d8, d1[2]
    vmull.s16       q11, d7, d1[1]
    vmlal.s16       q11, d9, d1[2]
    /* red: (22971 * (v - 128)) >> 14 */
    vmull.s16       q12, d8, d1[0]
    vmull.s16       q13, d9, d1[0]
    /* blue: (29033 * (u - 128)) >> 14 */
    vmull.s16       q14, d6, d1[3]
    vmull.s16       q15, d7, d1[3]
    /* Rounding shifts, narrowing back to 16 bits */
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    /* Add the luma to each channel */
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    /* Saturate to unsigned bytes in the requested channel registers */
    vqmovun.s16     d1\g_offs, q10
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short          0,      0,     0,      0        /* d0: padding */
    .short          22971, -11277, -23401, 29033    /* d1: multipliers */
    .short          -128,  -128,   -128,   -128     /* d2: chroma bias */
    .short          -128,  -128,   -128,   -128     /* d3: chroma bias */

asm_function jsimd_ycc_\colorid\()_convert_neon
    /* Arguments */
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    /* Row-pointer arrays of the three planes; the last one reuses r1 */
    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    /* Per-row pointers and the pixel counter */
    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers; NUM_ROWS is the stack argument just above them */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    /* INPUT_BUF2 overwrites INPUT_BUF (same register), so it goes last */
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines; skipped entirely if NUM_ROWS < 1 */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop: eight pixels per iteration while at least eight remain */
    subs            N, N, #8
    blt             2f
1:
    do_load         8
    do_yuv_to_rgb
    do_store        \bpp, 8
    subs            N, N, #8
    bge             1b
    tst             N, #7
    beq             8f
2:
    /*
     * Tail: the low three bits of N equal the number of pixels still left.
     * Gather the 4-, 2- and 1-pixel pieces into disjoint lanes, convert
     * them in one go, then store the same pieces. The numeric labels are
     * redefined on every .irp iteration; each 'f' reference binds to the
     * next definition.
     */
    .irp            piece, 4, 2, 1
    tst             N, #\piece
    beq             3f
    do_load         \piece
3:
    .endr
    do_yuv_to_rgb
    .irp            piece, 4, 2, 1
    tst             N, #\piece
    beq             4f
    do_store        \bpp, \piece
4:
    .endr
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

/* Purged so that the next instantiation can define its own variant */
.purgem do_yuv_to_rgb

.endm
904
/*
 * Instantiate the converters. The last three arguments select the
 * registers that receive the red, green and blue results (d10, d11, d12).
 *
 *                                  colorid   bpp  R  G  B
 */
generate_jsimd_ycc_rgb_convert_neon rgba8888, 32,  0, 1, 2
generate_jsimd_ycc_rgb_convert_neon rgb565,   16,  0, 1, 2


/* The load/store helpers are not needed past this point */
.purgem do_store
.purgem do_load
912
913/*****************************************************************************/
914