jsimd_arm_neon.S revision 3e00f03aea551192233e143bbc63c435e09a3afe
1/*
2 * ARMv7 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 *
8 * This software is provided 'as-is', without any express or implied
9 * warranty.  In no event will the authors be held liable for any damages
10 * arising from the use of this software.
11 *
12 * Permission is granted to anyone to use this software for any purpose,
13 * including commercial applications, and to alter it and redistribute it
14 * freely, subject to the following restrictions:
15 *
16 * 1. The origin of this software must not be misrepresented; you must not
17 *    claim that you wrote the original software. If you use this software
18 *    in a product, an acknowledgment in the product documentation would be
19 *    appreciated but is not required.
20 * 2. Altered source versions must be plainly marked as such, and must not be
21 *    misrepresented as being the original software.
22 * 3. This notice may not be removed or altered from any source distribution.
23 */
24
25#if defined(__linux__) && defined(__ELF__)
26.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
27#endif
28
29.text
30.fpu neon
31.arch armv7a
32.object_arch armv4
33.arm
34
35
36#define RESPECT_STRICT_ALIGNMENT 1
37
38
39/*****************************************************************************/
40
41/* Supplementary macro for setting function attributes */
42.macro asm_function fname
43#ifdef __APPLE__
44    .func _\fname
45    .globl _\fname
46_\fname:
47#else
48    .func \fname
49    .global \fname
50#ifdef __ELF__
51    .hidden \fname
52    .type \fname, %function
53#endif
54\fname:
55#endif
56.endm
57
58/* Transpose a block of 4x4 coefficients in four 64-bit registers */
59.macro transpose_4x4 x0, x1, x2, x3
60    vtrn.16 \x0, \x1
61    vtrn.16 \x2, \x3
62    vtrn.32 \x0, \x2
63    vtrn.32 \x1, \x3
64.endm
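
/* For example, if the four 64-bit registers initially hold the rows
 *     x0 = { a0 a1 a2 a3 }      x1 = { b0 b1 b2 b3 }
 *     x2 = { c0 c1 c2 c3 }      x3 = { d0 d1 d2 d3 }
 * then after the two VTRN.16 and two VTRN.32 steps above they hold the
 * columns
 *     x0 = { a0 b0 c0 d0 }      x1 = { a1 b1 c1 d1 }
 *     x2 = { a2 b2 c2 d2 }      x3 = { a3 b3 c3 d3 }
 */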
65
66
67#define CENTERJSAMPLE 128
68
69/*****************************************************************************/
70
71/*
72 * Perform dequantization and inverse DCT on one block of coefficients.
73 *
74 * GLOBAL(void)
75 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
76 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
77 */
78
79#define FIX_0_298631336  (2446)
80#define FIX_0_390180644  (3196)
81#define FIX_0_541196100  (4433)
82#define FIX_0_765366865  (6270)
83#define FIX_0_899976223  (7373)
84#define FIX_1_175875602  (9633)
85#define FIX_1_501321110  (12299)
86#define FIX_1_847759065  (15137)
87#define FIX_1_961570560  (16069)
88#define FIX_2_053119869  (16819)
89#define FIX_2_562915447  (20995)
90#define FIX_3_072711026  (25172)
91
92#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
93#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
94#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
95#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
96#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
97#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
98#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
99#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
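
/* These match the CONST_BITS = 13 fixed-point constants of jidctint.c,
 * i.e. FIX(x) = round(x * 2^13): for example,
 * round(0.298631336 * 8192) = 2446 and round(1.175875602 * 8192) = 9633.
 * A minimal C sketch of the encoding (illustration only, not used by the
 * build):
 *
 *     #include <math.h>
 *     #define CONST_BITS 13
 *     static long fix(double x) { return lround(x * (1L << CONST_BITS)); }
 *
 * The combined FIX_A_MINUS_B / FIX_A_PLUS_B constants above let a term such
 * as (a + b) * FIX_A - b * FIX_B be evaluated as
 * a * FIX_A + b * (FIX_A - FIX_B), i.e. one multiply-accumulate per input
 * row in the NEON code below.
 */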
100
101/*
102 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
103 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
104 */
105#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
106{                                                                             \
107    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
108    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
109    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
110                                                                              \
111    /* 1-D iDCT input data */                                                 \
112    row0 = xrow0;                                                             \
113    row1 = xrow1;                                                             \
114    row2 = xrow2;                                                             \
115    row3 = xrow3;                                                             \
116    row4 = xrow4;                                                             \
117    row5 = xrow5;                                                             \
118    row6 = xrow6;                                                             \
119    row7 = xrow7;                                                             \
120                                                                              \
121    q5 = row7 + row3;                                                         \
122    q4 = row5 + row1;                                                         \
123    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
124         MULTIPLY(q4, FIX_1_175875602);                                       \
125    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
126         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
127    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
128         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
129    q4 = q6;                                                                  \
130    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
131    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
132          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
133    /* now we can use q1 (reloadable constants have been used up) */          \
134    q1 = q3 + q2;                                                             \
135    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
136          MULTIPLY(row1, -FIX_0_899976223);                                   \
137    q5 = q7;                                                                  \
138    q1 = q1 + q6;                                                             \
139    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
140          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
141                                                                              \
142    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
143    tmp11_plus_tmp2 = q1;                                                     \
144    row1 = 0;                                                                 \
145                                                                              \
146    q1 = q1 - q6;                                                             \
147    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
148          MULTIPLY(row3, -FIX_2_562915447);                                   \
149    q1 = q1 - q6;                                                             \
150    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
151         MULTIPLY(row6, FIX_0_541196100);                                     \
152    q3 = q3 - q2;                                                             \
153                                                                              \
154    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
155    tmp11_minus_tmp2 = q1;                                                    \
156                                                                              \
157    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
158    q2 = q1 + q6;                                                             \
159    q1 = q1 - q6;                                                             \
160                                                                              \
161    /* pick up the results */                                                 \
162    tmp0  = q4;                                                               \
163    tmp1  = q5;                                                               \
164    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
165    tmp3  = q7;                                                               \
166    tmp10 = q2;                                                               \
167    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
168    tmp12 = q3;                                                               \
169    tmp13 = q1;                                                               \
170}
171
172#define XFIX_0_899976223                    d0[0]
173#define XFIX_0_541196100                    d0[1]
174#define XFIX_2_562915447                    d0[2]
175#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
176#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
177#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
178#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
179#define XFIX_1_175875602                    d1[3]
180#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
181#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
182#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
183#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
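
/* The XFIX_* defines map each constant to a scalar lane of d0-d2, so the
 * by-scalar forms of VMULL/VMLAL/VMLSL can reference them directly, e.g.
 *
 *     vmlal.s16       q7, d5, XFIX_1_175875602
 *
 * is simply
 *
 *     vmlal.s16       q7, d5, d1[3]
 *
 * once the constant table below has been loaded into d0-d2.
 */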
184
185.balign 16
186jsimd_idct_islow_neon_consts:
187    .short FIX_0_899976223                    /* d0[0] */
188    .short FIX_0_541196100                    /* d0[1] */
189    .short FIX_2_562915447                    /* d0[2] */
190    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
191    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
192    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
193    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
194    .short FIX_1_175875602                    /* d1[3] */
195    /* reloadable constants */
196    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
197    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
198    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
199    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
200
201asm_function jsimd_idct_islow_neon
202
203    DCT_TABLE       .req r0
204    COEF_BLOCK      .req r1
205    OUTPUT_BUF      .req r2
206    OUTPUT_COL      .req r3
207    TMP1            .req r0
208    TMP2            .req r1
209    TMP3            .req r2
210    TMP4            .req ip
211
212    ROW0L           .req d16
213    ROW0R           .req d17
214    ROW1L           .req d18
215    ROW1R           .req d19
216    ROW2L           .req d20
217    ROW2R           .req d21
218    ROW3L           .req d22
219    ROW3R           .req d23
220    ROW4L           .req d24
221    ROW4R           .req d25
222    ROW5L           .req d26
223    ROW5R           .req d27
224    ROW6L           .req d28
225    ROW6R           .req d29
226    ROW7L           .req d30
227    ROW7R           .req d31
228
229    /* Load and dequantize coefficients into NEON registers
230     * with the following allocation:
231     *       0 1 2 3 | 4 5 6 7
232     *      ---------+--------
233     *   0 | d16     | d17     ( q8  )
234     *   1 | d18     | d19     ( q9  )
235     *   2 | d20     | d21     ( q10 )
236     *   3 | d22     | d23     ( q11 )
237     *   4 | d24     | d25     ( q12 )
238     *   5 | d26     | d27     ( q13 )
239     *   6 | d28     | d29     ( q14 )
240     *   7 | d30     | d31     ( q15 )
241     */
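    /* The interleaved loads and multiplies below dequantize all 64
     * coefficients up front; a C sketch of the equivalent scalar step
     * (with 'workspace' as a hypothetical buffer name) would be:
     *
     *     int i;
     *     for (i = 0; i < DCTSIZE2; i++)
     *         workspace[i] = (INT16) (coef_block[i] * dct_table[i]);
     *
     * done here as eight 8-element 16-bit vector multiplies, one per row.
     */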
242    adr             ip, jsimd_idct_islow_neon_consts
243    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
244    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
245    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
246    vmul.s16        q8, q8, q0
247    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
248    vmul.s16        q9, q9, q1
249    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
250    vmul.s16        q10, q10, q2
251    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
252    vmul.s16        q11, q11, q3
253    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
254    vmul.s16        q12, q12, q0
255    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
256    vmul.s16        q14, q14, q2
257    vmul.s16        q13, q13, q1
258    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
259    add             ip, ip, #16
260    vmul.s16        q15, q15, q3
261    vpush           {d8-d15} /* save NEON registers */
262    /* 1-D IDCT, pass 1, left 4x8 half */
263    vadd.s16        d4,    ROW7L, ROW3L
264    vadd.s16        d5,    ROW5L, ROW1L
265    vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
266    vmlal.s16       q6,    d5,    XFIX_1_175875602
267    vmull.s16       q7,    d4,    XFIX_1_175875602
      /* Check for zero coefficients in the right 4x8 half */
269      push            {r4, r5}
270    vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
271    vsubl.s16       q3,    ROW0L, ROW4L
272      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
273    vmull.s16       q2,    ROW2L, XFIX_0_541196100
274    vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
275      orr             r0,    r4,    r5
276    vmov            q4,    q6
277    vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
278      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
279    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
280    vshl.s32        q3,    q3,    #13
281      orr             r0,    r0,    r4
282    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
283      orr             r0,    r0,    r5
284    vadd.s32        q1,    q3,    q2
285      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
286    vmov            q5,    q7
287    vadd.s32        q1,    q1,    q6
288      orr             r0,    r0,    r4
289    vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
290      orr             r0,    r0,    r5
291    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
292    vrshrn.s32      ROW1L, q1,    #11
293      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
294    vsub.s32        q1,    q1,    q6
295    vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
296      orr             r0,    r0,    r4
297    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
298      orr             r0,    r0,    r5
299    vsub.s32        q1,    q1,    q6
300    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
301      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
302    vmlal.s16       q6,    ROW6L, XFIX_0_541196100
303    vsub.s32        q3,    q3,    q2
304      orr             r0,    r0,    r4
305    vrshrn.s32      ROW6L, q1,    #11
306      orr             r0,    r0,    r5
307    vadd.s32        q1,    q3,    q5
308      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
309    vsub.s32        q3,    q3,    q5
310    vaddl.s16       q5,    ROW0L, ROW4L
311      orr             r0,    r0,    r4
312    vrshrn.s32      ROW2L, q1,    #11
313      orr             r0,    r0,    r5
314    vrshrn.s32      ROW5L, q3,    #11
315      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
316    vshl.s32        q5,    q5,    #13
317    vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
318      orr             r0,    r0,    r4
319    vadd.s32        q2,    q5,    q6
320      orrs            r0,    r0,    r5
321    vsub.s32        q1,    q5,    q6
322    vadd.s32        q6,    q2,    q7
323      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
324    vsub.s32        q2,    q2,    q7
325    vadd.s32        q5,    q1,    q4
326      orr             r0,    r4,    r5
327    vsub.s32        q3,    q1,    q4
328      pop             {r4, r5}
329    vrshrn.s32      ROW7L, q2,    #11
330    vrshrn.s32      ROW3L, q5,    #11
331    vrshrn.s32      ROW0L, q6,    #11
332    vrshrn.s32      ROW4L, q3,    #11
333
      beq             3f /* Go to special handling for the sparse right 4x8 half */
335
336    /* 1-D IDCT, pass 1, right 4x8 half */
337    vld1.s16        {d2},  [ip, :64]    /* reload constants */
338    vadd.s16        d10,   ROW7R, ROW3R
339    vadd.s16        d8,    ROW5R, ROW1R
340      /* Transpose left 4x8 half */
341      vtrn.16         ROW6L, ROW7L
342    vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
343    vmlal.s16       q6,    d8,    XFIX_1_175875602
344      vtrn.16         ROW2L, ROW3L
345    vmull.s16       q7,    d10,   XFIX_1_175875602
346    vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
347      vtrn.16         ROW0L, ROW1L
348    vsubl.s16       q3,    ROW0R, ROW4R
349    vmull.s16       q2,    ROW2R, XFIX_0_541196100
350    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
351      vtrn.16         ROW4L, ROW5L
352    vmov            q4,    q6
353    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
354    vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
355      vtrn.32         ROW1L, ROW3L
356    vshl.s32        q3,    q3,    #13
357    vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
358      vtrn.32         ROW4L, ROW6L
359    vadd.s32        q1,    q3,    q2
360    vmov            q5,    q7
361    vadd.s32        q1,    q1,    q6
362      vtrn.32         ROW0L, ROW2L
363    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
364    vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
365    vrshrn.s32      ROW1R, q1,    #11
366      vtrn.32         ROW5L, ROW7L
367    vsub.s32        q1,    q1,    q6
368    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
369    vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
370    vsub.s32        q1,    q1,    q6
371    vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
372    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
373    vsub.s32        q3,    q3,    q2
374    vrshrn.s32      ROW6R, q1,    #11
375    vadd.s32        q1,    q3,    q5
376    vsub.s32        q3,    q3,    q5
377    vaddl.s16       q5,    ROW0R, ROW4R
378    vrshrn.s32      ROW2R, q1,    #11
379    vrshrn.s32      ROW5R, q3,    #11
380    vshl.s32        q5,    q5,    #13
381    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
382    vadd.s32        q2,    q5,    q6
383    vsub.s32        q1,    q5,    q6
384    vadd.s32        q6,    q2,    q7
385    vsub.s32        q2,    q2,    q7
386    vadd.s32        q5,    q1,    q4
387    vsub.s32        q3,    q1,    q4
388    vrshrn.s32      ROW7R, q2,    #11
389    vrshrn.s32      ROW3R, q5,    #11
390    vrshrn.s32      ROW0R, q6,    #11
391    vrshrn.s32      ROW4R, q3,    #11
392    /* Transpose right 4x8 half */
393    vtrn.16         ROW6R, ROW7R
394    vtrn.16         ROW2R, ROW3R
395    vtrn.16         ROW0R, ROW1R
396    vtrn.16         ROW4R, ROW5R
397    vtrn.32         ROW1R, ROW3R
398    vtrn.32         ROW4R, ROW6R
399    vtrn.32         ROW0R, ROW2R
400    vtrn.32         ROW5R, ROW7R
401
4021:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
403    vld1.s16        {d2},  [ip, :64]    /* reload constants */
404    vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
405    vmlal.s16       q6,    ROW1L, XFIX_1_175875602
406    vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
407    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
408    vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
409    vmlal.s16       q7,    ROW3L, XFIX_1_175875602
410    vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
411    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
412    vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
413    vmull.s16       q2,    ROW2L, XFIX_0_541196100
414    vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
415    vmov            q4,    q6
416    vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
417    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
418    vshl.s32        q3,    q3,    #13
419    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
420    vadd.s32        q1,    q3,    q2
421    vmov            q5,    q7
422    vadd.s32        q1,    q1,    q6
423    vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
424    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
425    vshrn.s32       ROW1L, q1,    #16
426    vsub.s32        q1,    q1,    q6
427    vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
428    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
429    vsub.s32        q1,    q1,    q6
430    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
431    vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
432    vsub.s32        q3,    q3,    q2
433    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
434    vadd.s32        q1,    q3,    q5
435    vsub.s32        q3,    q3,    q5
436    vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
437    vshrn.s32       ROW2L, q1,    #16
438    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
439    vshl.s32        q5,    q5,    #13
440    vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
441    vadd.s32        q2,    q5,    q6
442    vsub.s32        q1,    q5,    q6
443    vadd.s32        q6,    q2,    q7
444    vsub.s32        q2,    q2,    q7
445    vadd.s32        q5,    q1,    q4
446    vsub.s32        q3,    q1,    q4
447    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
448    vshrn.s32       ROW3L, q5,    #16
449    vshrn.s32       ROW0L, q6,    #16
450    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
451    /* 1-D IDCT, pass 2, right 4x8 half */
452    vld1.s16        {d2},  [ip, :64]    /* reload constants */
453    vmull.s16       q6,    ROW5R, XFIX_1_175875602
454    vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
455    vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
456    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
457    vmull.s16       q7,    ROW7R, XFIX_1_175875602
458    vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
459    vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
460    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
461    vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
462    vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
463    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
464    vmov            q4,    q6
465    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
466    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
467    vshl.s32        q3,    q3,    #13
468    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
469    vadd.s32        q1,    q3,    q2
470    vmov            q5,    q7
471    vadd.s32        q1,    q1,    q6
472    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
473    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
474    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
475    vsub.s32        q1,    q1,    q6
476    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
477    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
478    vsub.s32        q1,    q1,    q6
479    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
480    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
481    vsub.s32        q3,    q3,    q2
482    vshrn.s32       ROW6R, q1,    #16
483    vadd.s32        q1,    q3,    q5
484    vsub.s32        q3,    q3,    q5
485    vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
486    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
487    vshrn.s32       ROW5R, q3,    #16
488    vshl.s32        q5,    q5,    #13
489    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
490    vadd.s32        q2,    q5,    q6
491    vsub.s32        q1,    q5,    q6
492    vadd.s32        q6,    q2,    q7
493    vsub.s32        q2,    q2,    q7
494    vadd.s32        q5,    q1,    q4
495    vsub.s32        q3,    q1,    q4
496    vshrn.s32       ROW7R, q2,    #16
497    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
498    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
499    vshrn.s32       ROW4R, q3,    #16
500
5012:  /* Descale to 8-bit and range limit */
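    /* The pass 2 results were shifted right by 16, so the additional
     * rounding shift by 2 (PASS1_BITS) below completes the 18-bit descale
     * (CONST_BITS + PASS1_BITS + 3) of the reference jpeg_idct_islow(),
     * saturating to signed 8-bit; adding CENTERJSAMPLE afterwards converts
     * the samples to the unsigned 0..255 range.
     */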
502    vqrshrn.s16     d16,   q8,    #2
503    vqrshrn.s16     d17,   q9,    #2
504    vqrshrn.s16     d18,   q10,   #2
505    vqrshrn.s16     d19,   q11,   #2
506    vpop            {d8-d15} /* restore NEON registers */
507    vqrshrn.s16     d20,   q12,   #2
508      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
509      vtrn.16         q8,    q9
510    vqrshrn.s16     d21,   q13,   #2
511    vqrshrn.s16     d22,   q14,   #2
512      vmov.u8         q0,    #(CENTERJSAMPLE)
513    vqrshrn.s16     d23,   q15,   #2
514      vtrn.8          d16,   d17
515      vtrn.8          d18,   d19
516      vadd.u8         q8,    q8,    q0
517      vadd.u8         q9,    q9,    q0
518      vtrn.16         q10,   q11
519        /* Store results to the output buffer */
520        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
521        add             TMP1, TMP1, OUTPUT_COL
522        add             TMP2, TMP2, OUTPUT_COL
523        vst1.8          {d16}, [TMP1]
524      vtrn.8          d20, d21
525        vst1.8          {d17}, [TMP2]
526        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
527        add             TMP1, TMP1, OUTPUT_COL
528        add             TMP2, TMP2, OUTPUT_COL
529        vst1.8          {d18}, [TMP1]
530      vadd.u8         q10,   q10,   q0
531        vst1.8          {d19}, [TMP2]
532        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
533        add             TMP1, TMP1, OUTPUT_COL
534        add             TMP2, TMP2, OUTPUT_COL
535        add             TMP3, TMP3, OUTPUT_COL
536        add             TMP4, TMP4, OUTPUT_COL
537      vtrn.8          d22, d23
538        vst1.8          {d20}, [TMP1]
539      vadd.u8         q11,   q11,   q0
540        vst1.8          {d21}, [TMP2]
541        vst1.8          {d22}, [TMP3]
542        vst1.8          {d23}, [TMP4]
543    bx              lr
544
5453:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
546
547    /* Transpose left 4x8 half */
548    vtrn.16         ROW6L, ROW7L
549    vtrn.16         ROW2L, ROW3L
550    vtrn.16         ROW0L, ROW1L
551    vtrn.16         ROW4L, ROW5L
552    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
553    vtrn.32         ROW1L, ROW3L
554    vtrn.32         ROW4L, ROW6L
555    vtrn.32         ROW0L, ROW2L
556    vtrn.32         ROW5L, ROW7L
557
558    cmp             r0, #0
559    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
560
    /* Only row 0 is non-zero for the right 4x8 half */
562    vdup.s16        ROW1R, ROW0R[1]
563    vdup.s16        ROW2R, ROW0R[2]
564    vdup.s16        ROW3R, ROW0R[3]
565    vdup.s16        ROW4R, ROW0R[0]
566    vdup.s16        ROW5R, ROW0R[1]
567    vdup.s16        ROW6R, ROW0R[2]
568    vdup.s16        ROW7R, ROW0R[3]
569    vdup.s16        ROW0R, ROW0R[0]
570    b               1b /* Go to 'normal' second pass */
571
5724:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
573    vld1.s16        {d2},  [ip, :64]    /* reload constants */
574    vmull.s16       q6,    ROW1L, XFIX_1_175875602
575    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
576    vmull.s16       q7,    ROW3L, XFIX_1_175875602
577    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
578    vmull.s16       q2,    ROW2L, XFIX_0_541196100
579    vshll.s16       q3,    ROW0L, #13
580    vmov            q4,    q6
581    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
582    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
583    vadd.s32        q1,    q3,    q2
584    vmov            q5,    q7
585    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
586    vadd.s32        q1,    q1,    q6
587    vadd.s32        q6,    q6,    q6
588    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
589    vshrn.s32       ROW1L, q1,    #16
590    vsub.s32        q1,    q1,    q6
591    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
592    vsub.s32        q3,    q3,    q2
593    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
594    vadd.s32        q1,    q3,    q5
595    vsub.s32        q3,    q3,    q5
596    vshll.s16       q5,    ROW0L, #13
597    vshrn.s32       ROW2L, q1,    #16
598    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
599    vadd.s32        q2,    q5,    q6
600    vsub.s32        q1,    q5,    q6
601    vadd.s32        q6,    q2,    q7
602    vsub.s32        q2,    q2,    q7
603    vadd.s32        q5,    q1,    q4
604    vsub.s32        q3,    q1,    q4
605    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
606    vshrn.s32       ROW3L, q5,    #16
607    vshrn.s32       ROW0L, q6,    #16
608    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
609    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
610    vld1.s16        {d2},  [ip, :64]    /* reload constants */
611    vmull.s16       q6,    ROW5L, XFIX_1_175875602
612    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
613    vmull.s16       q7,    ROW7L, XFIX_1_175875602
614    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
615    vmull.s16       q2,    ROW6L, XFIX_0_541196100
616    vshll.s16       q3,    ROW4L, #13
617    vmov            q4,    q6
618    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
619    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
620    vadd.s32        q1,    q3,    q2
621    vmov            q5,    q7
622    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
623    vadd.s32        q1,    q1,    q6
624    vadd.s32        q6,    q6,    q6
625    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
626    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
627    vsub.s32        q1,    q1,    q6
628    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
629    vsub.s32        q3,    q3,    q2
630    vshrn.s32       ROW6R, q1,    #16
631    vadd.s32        q1,    q3,    q5
632    vsub.s32        q3,    q3,    q5
633    vshll.s16       q5,    ROW4L, #13
634    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
635    vshrn.s32       ROW5R, q3,    #16
636    vadd.s32        q2,    q5,    q6
637    vsub.s32        q1,    q5,    q6
638    vadd.s32        q6,    q2,    q7
639    vsub.s32        q2,    q2,    q7
640    vadd.s32        q5,    q1,    q4
641    vsub.s32        q3,    q1,    q4
642    vshrn.s32       ROW7R, q2,    #16
643    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
644    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
645    vshrn.s32       ROW4R, q3,    #16
646    b               2b /* Go to epilogue */
647
648    .unreq          DCT_TABLE
649    .unreq          COEF_BLOCK
650    .unreq          OUTPUT_BUF
651    .unreq          OUTPUT_COL
652    .unreq          TMP1
653    .unreq          TMP2
654    .unreq          TMP3
655    .unreq          TMP4
656
657    .unreq          ROW0L
658    .unreq          ROW0R
659    .unreq          ROW1L
660    .unreq          ROW1R
661    .unreq          ROW2L
662    .unreq          ROW2R
663    .unreq          ROW3L
664    .unreq          ROW3R
665    .unreq          ROW4L
666    .unreq          ROW4R
667    .unreq          ROW5L
668    .unreq          ROW5R
669    .unreq          ROW6L
670    .unreq          ROW6R
671    .unreq          ROW7L
672    .unreq          ROW7R
673.endfunc
674
675
676/*****************************************************************************/
677
/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but less accurate, integer implementation
 * of the inverse DCT (Discrete Cosine Transform). It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally the 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
693
694#define XFIX_1_082392200 d0[0]
695#define XFIX_1_414213562 d0[1]
696#define XFIX_1_847759065 d0[2]
697#define XFIX_2_613125930 d0[3]
698
699.balign 16
700jsimd_idct_ifast_neon_consts:
701    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
702    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
703    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
704    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
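
/* VQDMULH.S16 produces the high half of the doubled product, i.e. roughly
 * (a * b * 2) >> 16, so a .short constant c stands for the factor
 * c / 32768 (Q15). The values above appear to be the 8-bit-precision
 * constants of jidctfst.c (e.g. 277 = round(1.082392200 * 256)) rescaled
 * to Q15 by the factor 128, minus the integer part (256 * 128 for the 1.x
 * constants, 512 * 128 for the 2.x constant), which Q15 cannot represent;
 * the integer part is added back with plain VADD instructions. For example:
 *
 *     277 * 128 - 256 * 128 = 2688,  and  2688 / 32768 = 0.08203125,
 *     which approximates 1.082392200 - 1.
 */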
705
706asm_function jsimd_idct_ifast_neon
707
708    DCT_TABLE       .req r0
709    COEF_BLOCK      .req r1
710    OUTPUT_BUF      .req r2
711    OUTPUT_COL      .req r3
712    TMP1            .req r0
713    TMP2            .req r1
714    TMP3            .req r2
715    TMP4            .req ip
716
717    /* Load and dequantize coefficients into NEON registers
718     * with the following allocation:
719     *       0 1 2 3 | 4 5 6 7
720     *      ---------+--------
721     *   0 | d16     | d17     ( q8  )
722     *   1 | d18     | d19     ( q9  )
723     *   2 | d20     | d21     ( q10 )
724     *   3 | d22     | d23     ( q11 )
725     *   4 | d24     | d25     ( q12 )
726     *   5 | d26     | d27     ( q13 )
727     *   6 | d28     | d29     ( q14 )
728     *   7 | d30     | d31     ( q15 )
729     */
730    adr             ip, jsimd_idct_ifast_neon_consts
731    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
732    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
733    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
734    vmul.s16        q8,  q8,  q0
735    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
736    vmul.s16        q9,  q9,  q1
737    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
738    vmul.s16        q10, q10, q2
739    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
740    vmul.s16        q11, q11, q3
741    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
742    vmul.s16        q12, q12, q0
743    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
744    vmul.s16        q14, q14, q2
745    vmul.s16        q13, q13, q1
746    vld1.16         {d0}, [ip, :64] /* load constants */
747    vmul.s16        q15, q15, q3
748    vpush           {d8-d13}        /* save NEON registers */
749    /* 1-D IDCT, pass 1 */
750    vsub.s16        q2,  q10, q14
751    vadd.s16        q14, q10, q14
752    vsub.s16        q1,  q11, q13
753    vadd.s16        q13, q11, q13
754    vsub.s16        q5,  q9,  q15
755    vadd.s16        q15, q9,  q15
756    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
757    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
758    vadd.s16        q3,  q1,  q1
759    vsub.s16        q1,  q5,  q1
760    vadd.s16        q10, q2,  q4
761    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
762    vsub.s16        q2,  q15, q13
763    vadd.s16        q3,  q3,  q6
764    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
765    vadd.s16        q1,  q1,  q4
766    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
767    vsub.s16        q10, q10, q14
768    vadd.s16        q2,  q2,  q6
769    vsub.s16        q6,  q8,  q12
770    vadd.s16        q12, q8,  q12
771    vadd.s16        q9,  q5,  q4
772    vadd.s16        q5,  q6,  q10
773    vsub.s16        q10, q6,  q10
774    vadd.s16        q6,  q15, q13
775    vadd.s16        q8,  q12, q14
776    vsub.s16        q3,  q6,  q3
777    vsub.s16        q12, q12, q14
778    vsub.s16        q3,  q3,  q1
779    vsub.s16        q1,  q9,  q1
780    vadd.s16        q2,  q3,  q2
781    vsub.s16        q15, q8,  q6
782    vadd.s16        q1,  q1,  q2
783    vadd.s16        q8,  q8,  q6
784    vadd.s16        q14, q5,  q3
785    vsub.s16        q9,  q5,  q3
786    vsub.s16        q13, q10, q2
787    vadd.s16        q10, q10, q2
788      /* Transpose */
789      vtrn.16         q8,  q9
790    vsub.s16        q11, q12, q1
791      vtrn.16         q14, q15
792    vadd.s16        q12, q12, q1
793      vtrn.16         q10, q11
794      vtrn.16         q12, q13
795      vtrn.32         q9,  q11
796      vtrn.32         q12, q14
797      vtrn.32         q8,  q10
798      vtrn.32         q13, q15
799      vswp            d28, d21
800      vswp            d26, d19
801    /* 1-D IDCT, pass 2 */
802    vsub.s16        q2,  q10, q14
803      vswp            d30, d23
804    vadd.s16        q14, q10, q14
805      vswp            d24, d17
806    vsub.s16        q1,  q11, q13
807    vadd.s16        q13, q11, q13
808    vsub.s16        q5,  q9,  q15
809    vadd.s16        q15, q9,  q15
810    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
811    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
812    vadd.s16        q3,  q1,  q1
813    vsub.s16        q1,  q5,  q1
814    vadd.s16        q10, q2,  q4
815    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
816    vsub.s16        q2,  q15, q13
817    vadd.s16        q3,  q3,  q6
818    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
819    vadd.s16        q1,  q1,  q4
820    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
821    vsub.s16        q10, q10, q14
822    vadd.s16        q2,  q2,  q6
823    vsub.s16        q6,  q8,  q12
824    vadd.s16        q12, q8,  q12
825    vadd.s16        q9,  q5,  q4
826    vadd.s16        q5,  q6,  q10
827    vsub.s16        q10, q6,  q10
828    vadd.s16        q6,  q15, q13
829    vadd.s16        q8,  q12, q14
830    vsub.s16        q3,  q6,  q3
831    vsub.s16        q12, q12, q14
832    vsub.s16        q3,  q3,  q1
833    vsub.s16        q1,  q9,  q1
834    vadd.s16        q2,  q3,  q2
835    vsub.s16        q15, q8,  q6
836    vadd.s16        q1,  q1,  q2
837    vadd.s16        q8,  q8,  q6
838    vadd.s16        q14, q5,  q3
839    vsub.s16        q9,  q5,  q3
840    vsub.s16        q13, q10, q2
841    vpop            {d8-d13}        /* restore NEON registers */
842    vadd.s16        q10, q10, q2
843    vsub.s16        q11, q12, q1
844    vadd.s16        q12, q12, q1
845    /* Descale to 8-bit and range limit */
846    vmov.u8         q0,  #0x80
847    vqshrn.s16      d16, q8,  #5
848    vqshrn.s16      d17, q9,  #5
849    vqshrn.s16      d18, q10, #5
850    vqshrn.s16      d19, q11, #5
851    vqshrn.s16      d20, q12, #5
852    vqshrn.s16      d21, q13, #5
853    vqshrn.s16      d22, q14, #5
854    vqshrn.s16      d23, q15, #5
855    vadd.u8         q8,  q8,  q0
856    vadd.u8         q9,  q9,  q0
857    vadd.u8         q10, q10, q0
858    vadd.u8         q11, q11, q0
859    /* Transpose the final 8-bit samples */
860    vtrn.16         q8,  q9
861    vtrn.16         q10, q11
862    vtrn.32         q8,  q10
863    vtrn.32         q9,  q11
864    vtrn.8          d16, d17
865    vtrn.8          d18, d19
866      /* Store results to the output buffer */
867      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
868      add             TMP1, TMP1, OUTPUT_COL
869      add             TMP2, TMP2, OUTPUT_COL
870      vst1.8          {d16}, [TMP1]
871      vst1.8          {d17}, [TMP2]
872      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
873      add             TMP1, TMP1, OUTPUT_COL
874      add             TMP2, TMP2, OUTPUT_COL
875      vst1.8          {d18}, [TMP1]
876    vtrn.8          d20, d21
877      vst1.8          {d19}, [TMP2]
878      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
879      add             TMP1, TMP1, OUTPUT_COL
880      add             TMP2, TMP2, OUTPUT_COL
881      add             TMP3, TMP3, OUTPUT_COL
882      add             TMP4, TMP4, OUTPUT_COL
883      vst1.8          {d20}, [TMP1]
884    vtrn.8          d22, d23
885      vst1.8          {d21}, [TMP2]
886      vst1.8          {d22}, [TMP3]
887      vst1.8          {d23}, [TMP4]
888    bx              lr
889
890    .unreq          DCT_TABLE
891    .unreq          COEF_BLOCK
892    .unreq          OUTPUT_BUF
893    .unreq          OUTPUT_COL
894    .unreq          TMP1
895    .unreq          TMP2
896    .unreq          TMP3
897    .unreq          TMP4
898.endfunc
899
900
901/*****************************************************************************/
902
/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT,
 *       which requires far fewer arithmetic operations and hence should
 *       be faster. The primary purpose of this particular NEON-optimized
 *       function is bit-exact compatibility with jpeg-6b.
 *
 * TODO: Slightly better instruction scheduling could be achieved by
 *       expanding the idct_helper/transpose_4x4 macros and reordering
 *       instructions, but readability would suffer somewhat.
 */
920
921#define CONST_BITS  13
922
923#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
924#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
925#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
926#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
927#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
928#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
929#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
930#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
931#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
932#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
933#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
934#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
935#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
936#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
937
938.balign 16
939jsimd_idct_4x4_neon_consts:
940    .short     FIX_1_847759065     /* d0[0] */
941    .short     -FIX_0_765366865    /* d0[1] */
942    .short     -FIX_0_211164243    /* d0[2] */
943    .short     FIX_1_451774981     /* d0[3] */
944    .short     -FIX_2_172734803    /* d1[0] */
945    .short     FIX_1_061594337     /* d1[1] */
946    .short     -FIX_0_509795579    /* d1[2] */
947    .short     -FIX_0_601344887    /* d1[3] */
948    .short     FIX_0_899976223     /* d2[0] */
949    .short     FIX_2_562915447     /* d2[1] */
950    .short     1 << (CONST_BITS+1) /* d2[2] */
951    .short     0                   /* d2[3] */
952
953.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
954    vmull.s16       q14, \x4,  d2[2]
955    vmlal.s16       q14, \x8,  d0[0]
956    vmlal.s16       q14, \x14, d0[1]
957
958    vmull.s16       q13, \x16, d1[2]
959    vmlal.s16       q13, \x12, d1[3]
960    vmlal.s16       q13, \x10, d2[0]
961    vmlal.s16       q13, \x6,  d2[1]
962
963    vmull.s16       q15, \x4,  d2[2]
964    vmlsl.s16       q15, \x8,  d0[0]
965    vmlsl.s16       q15, \x14, d0[1]
966
967    vmull.s16       q12, \x16, d0[2]
968    vmlal.s16       q12, \x12, d0[3]
969    vmlal.s16       q12, \x10, d1[0]
970    vmlal.s16       q12, \x6,  d1[1]
971
972    vadd.s32        q10, q14, q13
973    vsub.s32        q14, q14, q13
974
975.if \shift > 16
976    vrshr.s32       q10,  q10, #\shift
977    vrshr.s32       q14,  q14, #\shift
978    vmovn.s32       \y26, q10
979    vmovn.s32       \y29, q14
980.else
981    vrshrn.s32      \y26, q10, #\shift
982    vrshrn.s32      \y29, q14, #\shift
983.endif
984
985    vadd.s32        q10, q15, q12
986    vsub.s32        q15, q15, q12
987
988.if \shift > 16
989    vrshr.s32       q10,  q10, #\shift
990    vrshr.s32       q15,  q15, #\shift
991    vmovn.s32       \y27, q10
992    vmovn.s32       \y28, q15
993.else
994    vrshrn.s32      \y27, q10, #\shift
995    vrshrn.s32      \y28, q15, #\shift
996.endif
997
998.endm
999
1000asm_function jsimd_idct_4x4_neon
1001
1002    DCT_TABLE       .req r0
1003    COEF_BLOCK      .req r1
1004    OUTPUT_BUF      .req r2
1005    OUTPUT_COL      .req r3
1006    TMP1            .req r0
1007    TMP2            .req r1
1008    TMP3            .req r2
1009    TMP4            .req ip
1010
1011    vpush           {d8-d15}
1012
1013    /* Load constants (d3 is just used for padding) */
1014    adr             TMP4, jsimd_idct_4x4_neon_consts
1015    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
1016
1017    /* Load all COEF_BLOCK into NEON registers with the following allocation:
1018     *       0 1 2 3 | 4 5 6 7
1019     *      ---------+--------
1020     *   0 | d4      | d5
1021     *   1 | d6      | d7
1022     *   2 | d8      | d9
1023     *   3 | d10     | d11
1024     *   4 | -       | -
1025     *   5 | d12     | d13
1026     *   6 | d14     | d15
1027     *   7 | d16     | d17
1028     */
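    /* Coefficient row 4 is not used by the reduced 4x4 IDCT, hence the gap
     * in the allocation above; the extra "add ..., #16" adjustments of
     * COEF_BLOCK and DCT_TABLE below simply skip over it.
     */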
1029    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1030    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
1032    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
1033    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
1034    /* dequantize */
1035    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1036    vmul.s16        q2, q2, q9
1037    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
1038    vmul.s16        q3, q3, q10
1039    vmul.s16        q4, q4, q11
1040    add             DCT_TABLE, DCT_TABLE, #16
1041    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
1042    vmul.s16        q5, q5, q12
1043    vmul.s16        q6, q6, q13
1044    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
1045    vmul.s16        q7, q7, q14
1046    vmul.s16        q8, q8, q15
1047
1048    /* Pass 1 */
1049    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
1050    transpose_4x4   d4, d6, d8, d10
1051    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
1052    transpose_4x4   d5, d7, d9, d11
1053
1054    /* Pass 2 */
1055    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
1056    transpose_4x4   d26, d27, d28, d29
1057
1058    /* Range limit */
1059    vmov.u16        q15, #0x80
1060    vadd.s16        q13, q13, q15
1061    vadd.s16        q14, q14, q15
1062    vqmovun.s16     d26, q13
1063    vqmovun.s16     d27, q14
1064
1065    /* Store results to the output buffer */
1066    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
1067    add             TMP1, TMP1, OUTPUT_COL
1068    add             TMP2, TMP2, OUTPUT_COL
1069    add             TMP3, TMP3, OUTPUT_COL
1070    add             TMP4, TMP4, OUTPUT_COL
1071
1072#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses.
     */
1076    vst1.32         {d26[0]}, [TMP1]!
1077    vst1.32         {d27[0]}, [TMP3]!
1078    vst1.32         {d26[1]}, [TMP2]!
1079    vst1.32         {d27[1]}, [TMP4]!
1080#else
1081    vst1.8          {d26[0]}, [TMP1]!
1082    vst1.8          {d27[0]}, [TMP3]!
1083    vst1.8          {d26[1]}, [TMP1]!
1084    vst1.8          {d27[1]}, [TMP3]!
1085    vst1.8          {d26[2]}, [TMP1]!
1086    vst1.8          {d27[2]}, [TMP3]!
1087    vst1.8          {d26[3]}, [TMP1]!
1088    vst1.8          {d27[3]}, [TMP3]!
1089
1090    vst1.8          {d26[4]}, [TMP2]!
1091    vst1.8          {d27[4]}, [TMP4]!
1092    vst1.8          {d26[5]}, [TMP2]!
1093    vst1.8          {d27[5]}, [TMP4]!
1094    vst1.8          {d26[6]}, [TMP2]!
1095    vst1.8          {d27[6]}, [TMP4]!
1096    vst1.8          {d26[7]}, [TMP2]!
1097    vst1.8          {d27[7]}, [TMP4]!
1098#endif
1099
1100    vpop            {d8-d15}
1101    bx              lr
1102
1103    .unreq          DCT_TABLE
1104    .unreq          COEF_BLOCK
1105    .unreq          OUTPUT_BUF
1106    .unreq          OUTPUT_COL
1107    .unreq          TMP1
1108    .unreq          TMP2
1109    .unreq          TMP3
1110    .unreq          TMP4
1111.endfunc
1112
1113.purgem idct_helper
1114
1115
1116/*****************************************************************************/
1117
/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT,
 *       which requires far fewer arithmetic operations and hence should
 *       be faster. The primary purpose of this particular NEON-optimized
 *       function is bit-exact compatibility with jpeg-6b.
 */
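
/* For reference, the 1-D computation performed by the idct_helper macro
 * below (a sketch following jpeg_idct_2x2() from jidctred.c, using the
 * MULTIPLY/DESCALE notation of libjpeg) is:
 *
 *     even = c0 << (CONST_BITS + 2);
 *     odd  = MULTIPLY(c1, FIX_3_624509785) - MULTIPLY(c3, FIX_1_272758580) +
 *            MULTIPLY(c5, FIX_0_850430095) - MULTIPLY(c7, FIX_0_720959822);
 *     out0 = DESCALE(even + odd, shift);
 *     out1 = DESCALE(even - odd, shift);
 *
 * Only the DC term and the odd-numbered coefficients contribute, which is
 * why rows 2, 4 and 6 are never loaded.
 */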
1131
1132.balign 8
1133jsimd_idct_2x2_neon_consts:
1134    .short     -FIX_0_720959822    /* d0[0] */
1135    .short     FIX_0_850430095     /* d0[1] */
1136    .short     -FIX_1_272758580    /* d0[2] */
1137    .short     FIX_3_624509785     /* d0[3] */
1138
1139.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1140    vshll.s16  q14,  \x4,  #15
1141    vmull.s16  q13,  \x6,  d0[3]
1142    vmlal.s16  q13,  \x10, d0[2]
1143    vmlal.s16  q13,  \x12, d0[1]
1144    vmlal.s16  q13,  \x16, d0[0]
1145
1146    vadd.s32   q10,  q14,  q13
1147    vsub.s32   q14,  q14,  q13
1148
1149.if \shift > 16
1150    vrshr.s32  q10,  q10,  #\shift
1151    vrshr.s32  q14,  q14,  #\shift
1152    vmovn.s32  \y26, q10
1153    vmovn.s32  \y27, q14
1154.else
1155    vrshrn.s32 \y26, q10,  #\shift
1156    vrshrn.s32 \y27, q14,  #\shift
1157.endif
1158
1159.endm
1160
1161asm_function jsimd_idct_2x2_neon
1162
1163    DCT_TABLE       .req r0
1164    COEF_BLOCK      .req r1
1165    OUTPUT_BUF      .req r2
1166    OUTPUT_COL      .req r3
1167    TMP1            .req r0
1168    TMP2            .req ip
1169
1170    vpush           {d8-d15}
1171
1172    /* Load constants */
1173    adr             TMP2, jsimd_idct_2x2_neon_consts
1174    vld1.16         {d0}, [TMP2, :64]
1175
1176    /* Load all COEF_BLOCK into NEON registers with the following allocation:
1177     *       0 1 2 3 | 4 5 6 7
1178     *      ---------+--------
1179     *   0 | d4      | d5
1180     *   1 | d6      | d7
1181     *   2 | -       | -
1182     *   3 | d10     | d11
1183     *   4 | -       | -
1184     *   5 | d12     | d13
1185     *   6 | -       | -
1186     *   7 | d16     | d17
1187     */
1188    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1189    add             COEF_BLOCK, COEF_BLOCK, #16
1190    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
1191    add             COEF_BLOCK, COEF_BLOCK, #16
1192    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
1193    add             COEF_BLOCK, COEF_BLOCK, #16
1194    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
1195    /* Dequantize */
1196    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1197    vmul.s16        q2, q2, q9
1198    vmul.s16        q3, q3, q10
1199    add             DCT_TABLE, DCT_TABLE, #16
1200    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
1201    vmul.s16        q5, q5, q12
1202    add             DCT_TABLE, DCT_TABLE, #16
1203    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
1204    vmul.s16        q6, q6, q13
1205    add             DCT_TABLE, DCT_TABLE, #16
1206    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
1207    vmul.s16        q8, q8, q15
1208
1209    /* Pass 1 */
1210#if 0
1211    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
1212    transpose_4x4   d4, d6, d8,  d10
1213    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
1214    transpose_4x4   d5, d7, d9,  d11
1215#else
1216    vmull.s16       q13, d6,  d0[3]
1217    vmlal.s16       q13, d10, d0[2]
1218    vmlal.s16       q13, d12, d0[1]
1219    vmlal.s16       q13, d16, d0[0]
1220    vmull.s16       q12, d7,  d0[3]
1221    vmlal.s16       q12, d11, d0[2]
1222    vmlal.s16       q12, d13, d0[1]
1223    vmlal.s16       q12, d17, d0[0]
1224    vshll.s16       q14, d4,  #15
1225    vshll.s16       q15, d5,  #15
1226    vadd.s32        q10, q14, q13
1227    vsub.s32        q14, q14, q13
1228    vrshrn.s32      d4,  q10, #13
1229    vrshrn.s32      d6,  q14, #13
1230    vadd.s32        q10, q15, q12
1231    vsub.s32        q14, q15, q12
1232    vrshrn.s32      d5,  q10, #13
1233    vrshrn.s32      d7,  q14, #13
1234    vtrn.16         q2,  q3
1235    vtrn.32         q3,  q5
1236#endif
1237
1238    /* Pass 2 */
1239    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
1240
1241    /* Range limit */
1242    vmov.u16        q15, #0x80
1243    vadd.s16        q13, q13, q15
1244    vqmovun.s16     d26, q13
1245    vqmovun.s16     d27, q13
1246
1247    /* Store results to the output buffer */
1248    ldmia           OUTPUT_BUF, {TMP1, TMP2}
1249    add             TMP1, TMP1, OUTPUT_COL
1250    add             TMP2, TMP2, OUTPUT_COL
1251
1252    vst1.8          {d26[0]}, [TMP1]!
1253    vst1.8          {d27[4]}, [TMP1]!
1254    vst1.8          {d26[1]}, [TMP2]!
1255    vst1.8          {d27[5]}, [TMP2]!
1256
1257    vpop            {d8-d15}
1258    bx              lr
1259
1260    .unreq          DCT_TABLE
1261    .unreq          COEF_BLOCK
1262    .unreq          OUTPUT_BUF
1263    .unreq          OUTPUT_COL
1264    .unreq          TMP1
1265    .unreq          TMP2
1266.endfunc
1267
1268.purgem idct_helper
1269
1270
1271/*****************************************************************************/
1272
1273/*
1274 * jsimd_ycc_extrgb_convert_neon
1275 * jsimd_ycc_extbgr_convert_neon
1276 * jsimd_ycc_extrgbx_convert_neon
1277 * jsimd_ycc_extbgrx_convert_neon
1278 * jsimd_ycc_extxbgr_convert_neon
1279 * jsimd_ycc_extxrgb_convert_neon
1280 *
1281 * Colorspace conversion YCbCr -> RGB
1282 */
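
/* The conversion implemented below follows the standard libjpeg (BT.601)
 * equations, with Cb and Cr centered on 128:
 *
 *     R = Y                        + 1.40200 * (Cr - 128)
 *     G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *     B = Y + 1.77200 * (Cb - 128)
 *
 * The multipliers mentioned in the stage macros below (22971, -11277,
 * -23401, 29033) appear to be fixed-point (Q14/Q15) encodings of these
 * factors, applied with widening multiplies and narrowed back with
 * rounding shifts of #14/#15.
 */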
1283
1284
1285.macro do_load size
1286    .if \size == 8
1287        vld1.8  {d4}, [U, :64]!
1288        vld1.8  {d5}, [V, :64]!
1289        vld1.8  {d0}, [Y, :64]!
1290        pld     [U, #64]
1291        pld     [V, #64]
1292        pld     [Y, #64]
1293    .elseif \size == 4
1294        vld1.8  {d4[0]}, [U]!
1295        vld1.8  {d4[1]}, [U]!
1296        vld1.8  {d4[2]}, [U]!
1297        vld1.8  {d4[3]}, [U]!
1298        vld1.8  {d5[0]}, [V]!
1299        vld1.8  {d5[1]}, [V]!
1300        vld1.8  {d5[2]}, [V]!
1301        vld1.8  {d5[3]}, [V]!
1302        vld1.8  {d0[0]}, [Y]!
1303        vld1.8  {d0[1]}, [Y]!
1304        vld1.8  {d0[2]}, [Y]!
1305        vld1.8  {d0[3]}, [Y]!
1306    .elseif \size == 2
1307        vld1.8  {d4[4]}, [U]!
1308        vld1.8  {d4[5]}, [U]!
1309        vld1.8  {d5[4]}, [V]!
1310        vld1.8  {d5[5]}, [V]!
1311        vld1.8  {d0[4]}, [Y]!
1312        vld1.8  {d0[5]}, [Y]!
1313    .elseif \size == 1
1314        vld1.8  {d4[6]}, [U]!
1315        vld1.8  {d5[6]}, [V]!
1316        vld1.8  {d0[6]}, [Y]!
1317    .else
1318        .error unsupported macroblock size
1319    .endif
1320.endm
1321
1322.macro do_store bpp, size
1323    .if \bpp == 24
1324        .if \size == 8
1325            vst3.8  {d10, d11, d12}, [RGB]!
1326        .elseif \size == 4
1327            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
1328            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
1329            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
1330            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
1331        .elseif \size == 2
1332            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
1333            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
1334        .elseif \size == 1
1335            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
1336        .else
1337            .error unsupported macroblock size
1338        .endif
1339    .elseif \bpp == 32
1340        .if \size == 8
1341            vst4.8  {d10, d11, d12, d13}, [RGB]!
1342        .elseif \size == 4
1343            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1344            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1345            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1346            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1347        .elseif \size == 2
1348            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1349            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1350        .elseif \size == 1
1351            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1352        .else
1353            .error unsupported macroblock size
1354        .endif
1355    .else
1356        .error unsupported bpp
1357    .endif
1358.endm
1359
1360.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1361
1362/*
1363 * 2 stage pipelined YCbCr->RGB conversion
1364 */
1365
1366.macro do_yuv_to_rgb_stage1
1367    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
1368    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
1369    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
1370    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
1371    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
1372    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
1373    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
1374    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
1375    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
1376    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
1377.endm
1378
1379.macro do_yuv_to_rgb_stage2
1380    vrshrn.s32      d20, q10, #15
1381    vrshrn.s32      d21, q11, #15
1382    vrshrn.s32      d24, q12, #14
1383    vrshrn.s32      d25, q13, #14
1384    vrshrn.s32      d28, q14, #14
1385    vrshrn.s32      d29, q15, #14
1386    vaddw.u8        q10, q10, d0
1387    vaddw.u8        q12, q12, d0
1388    vaddw.u8        q14, q14, d0
1389    vqmovun.s16     d1\g_offs, q10
1390    vqmovun.s16     d1\r_offs, q12
1391    vqmovun.s16     d1\b_offs, q14
1392.endm
1393
1394.macro do_yuv_to_rgb_stage2_store_load_stage1
1395    vld1.8          {d4}, [U, :64]!
1396      vrshrn.s32      d20, q10, #15
1397      vrshrn.s32      d21, q11, #15
1398      vrshrn.s32      d24, q12, #14
1399      vrshrn.s32      d25, q13, #14
1400      vrshrn.s32      d28, q14, #14
1401    vld1.8          {d5}, [V, :64]!
1402      vrshrn.s32      d29, q15, #14
1403      vaddw.u8        q10, q10, d0
1404      vaddw.u8        q12, q12, d0
1405      vaddw.u8        q14, q14, d0
1406      vqmovun.s16     d1\g_offs, q10
1407    vld1.8          {d0}, [Y, :64]!
1408      vqmovun.s16     d1\r_offs, q12
1409    pld             [U, #64]
1410    pld             [V, #64]
1411    pld             [Y, #64]
1412      vqmovun.s16     d1\b_offs, q14
1413    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
1414    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
1415      do_store        \bpp, 8
1416    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
1417    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
1418    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
1419    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
1420    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
1421    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
1422    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
1423    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
1424.endm
1425
1426.macro do_yuv_to_rgb
1427    do_yuv_to_rgb_stage1
1428    do_yuv_to_rgb_stage2
1429.endm
1430
1431/* Apple gas crashes on adrl; work around it by using adr instead.
1432 * This requires a copy of these constants for each function.
1433 */
1434
1435.balign 16
1436jsimd_ycc_\colorid\()_neon_consts:
1437    .short          0,      0,     0,      0
1438    .short          22971, -11277, -23401, 29033
1439    .short          -128,  -128,   -128,   -128
1440    .short          -128,  -128,   -128,   -128
1441
1442asm_function jsimd_ycc_\colorid\()_convert_neon
1443    OUTPUT_WIDTH    .req r0
1444    INPUT_BUF       .req r1
1445    INPUT_ROW       .req r2
1446    OUTPUT_BUF      .req r3
1447    NUM_ROWS        .req r4
1448
1449    INPUT_BUF0      .req r5
1450    INPUT_BUF1      .req r6
1451    INPUT_BUF2      .req INPUT_BUF
1452
1453    RGB             .req r7
1454    Y               .req r8
1455    U               .req r9
1456    V               .req r10
1457    N               .req ip
1458
1459    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
1460    adr             ip, jsimd_ycc_\colorid\()_neon_consts
1461    vld1.16         {d0, d1, d2, d3}, [ip, :128]
1462
1463    /* Save ARM registers and handle input arguments */
1464    push            {r4, r5, r6, r7, r8, r9, r10, lr}
1465    ldr             NUM_ROWS, [sp, #(4 * 8)]
1466    ldr             INPUT_BUF0, [INPUT_BUF]
1467    ldr             INPUT_BUF1, [INPUT_BUF, #4]
1468    ldr             INPUT_BUF2, [INPUT_BUF, #8]
1469    .unreq          INPUT_BUF
1470
1471    /* Save NEON registers */
1472    vpush           {d8-d15}
1473
1474    /* Initially set d10, d11, d12, d13 to 0xFF */
1475    vmov.u8         q5, #255
1476    vmov.u8         q6, #255
1477
1478    /* Outer loop over scanlines */
1479    cmp             NUM_ROWS, #1
1480    blt             9f
14810:
1482    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
1483    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
1484    mov             N, OUTPUT_WIDTH
1485    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
1486    add             INPUT_ROW, INPUT_ROW, #1
1487    ldr             RGB, [OUTPUT_BUF], #4
1488
1489    /* Inner loop over pixels */
1490    subs            N, N, #8
1491    blt             3f
1492    do_load         8
1493    do_yuv_to_rgb_stage1
1494    subs            N, N, #8
1495    blt             2f
14961:
1497    do_yuv_to_rgb_stage2_store_load_stage1
1498    subs            N, N, #8
1499    bge             1b
15002:
1501    do_yuv_to_rgb_stage2
1502    do_store        \bpp, 8
1503    tst             N, #7
1504    beq             8f
15053:
1506    tst             N, #4
1507    beq             3f
1508    do_load         4
15093:
1510    tst             N, #2
1511    beq             4f
1512    do_load         2
15134:
1514    tst             N, #1
1515    beq             5f
1516    do_load         1
15175:
1518    do_yuv_to_rgb
1519    tst             N, #4
1520    beq             6f
1521    do_store        \bpp, 4
15226:
1523    tst             N, #2
1524    beq             7f
1525    do_store        \bpp, 2
15267:
1527    tst             N, #1
1528    beq             8f
1529    do_store        \bpp, 1
15308:
1531    subs            NUM_ROWS, NUM_ROWS, #1
1532    bgt             0b
15339:
1534    /* Restore all registers and return */
1535    vpop            {d8-d15}
1536    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
1537
1538    .unreq          OUTPUT_WIDTH
1539    .unreq          INPUT_ROW
1540    .unreq          OUTPUT_BUF
1541    .unreq          NUM_ROWS
1542    .unreq          INPUT_BUF0
1543    .unreq          INPUT_BUF1
1544    .unreq          INPUT_BUF2
1545    .unreq          RGB
1546    .unreq          Y
1547    .unreq          U
1548    .unreq          V
1549    .unreq          N
1550.endfunc
1551
1552.purgem do_yuv_to_rgb
1553.purgem do_yuv_to_rgb_stage1
1554.purgem do_yuv_to_rgb_stage2
1555.purgem do_yuv_to_rgb_stage2_store_load_stage1
1556
1557.endm
1558
1559/*--------------------------------- id ----- bpp R  G  B */
1560generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
1561generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
1562generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
1563generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
1564generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
1565generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
1566
1567.purgem do_load
1568.purgem do_store
1569
1570
1571/*****************************************************************************/
1572
1573/*
1574 * jsimd_extrgb_ycc_convert_neon
1575 * jsimd_extbgr_ycc_convert_neon
1576 * jsimd_extrgbx_ycc_convert_neon
1577 * jsimd_extbgrx_ycc_convert_neon
1578 * jsimd_extxbgr_ycc_convert_neon
1579 * jsimd_extxrgb_ycc_convert_neon
1580 *
1581 * Colorspace conversion RGB -> YCbCr
1582 */
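/*
 * For reference, a rough scalar sketch (an editorial addition, not part of
 * the original source) of the fixed-point arithmetic the NEON code below
 * implements, using the constants from the per-function
 * jsimd_*_ycc_neon_consts tables (products are accumulated in 32 bits
 * before the final >> 16):
 *
 *     Y  = (19595 * R + 38470 * G +  7471 * B + 32768) >> 16;
 *     Cb = ((128 << 16) + 32767 - 11059 * R - 21709 * G + 32768 * B) >> 16;
 *     Cr = ((128 << 16) + 32767 + 32768 * R - 27439 * G -  5329 * B) >> 16;
 *
 * i.e. approximately Y  =  0.29900 R + 0.58700 G + 0.11400 B,
 *                    Cb = -0.16874 R - 0.33126 G + 0.50000 B + 128,
 *                    Cr =  0.50000 R - 0.41869 G - 0.08131 B + 128.
 */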
1583
1584.macro do_store size
1585    .if \size == 8
1586        vst1.8  {d20}, [Y]!
1587        vst1.8  {d21}, [U]!
1588        vst1.8  {d22}, [V]!
1589    .elseif \size == 4
1590        vst1.8  {d20[0]}, [Y]!
1591        vst1.8  {d20[1]}, [Y]!
1592        vst1.8  {d20[2]}, [Y]!
1593        vst1.8  {d20[3]}, [Y]!
1594        vst1.8  {d21[0]}, [U]!
1595        vst1.8  {d21[1]}, [U]!
1596        vst1.8  {d21[2]}, [U]!
1597        vst1.8  {d21[3]}, [U]!
1598        vst1.8  {d22[0]}, [V]!
1599        vst1.8  {d22[1]}, [V]!
1600        vst1.8  {d22[2]}, [V]!
1601        vst1.8  {d22[3]}, [V]!
1602    .elseif \size == 2
1603        vst1.8  {d20[4]}, [Y]!
1604        vst1.8  {d20[5]}, [Y]!
1605        vst1.8  {d21[4]}, [U]!
1606        vst1.8  {d21[5]}, [U]!
1607        vst1.8  {d22[4]}, [V]!
1608        vst1.8  {d22[5]}, [V]!
1609    .elseif \size == 1
1610        vst1.8  {d20[6]}, [Y]!
1611        vst1.8  {d21[6]}, [U]!
1612        vst1.8  {d22[6]}, [V]!
1613    .else
1614        .error unsupported macroblock size
1615    .endif
1616.endm
1617
1618.macro do_load bpp, size
1619    .if \bpp == 24
1620        .if \size == 8
1621            vld3.8  {d10, d11, d12}, [RGB]!
1622            pld     [RGB, #128]
1623        .elseif \size == 4
1624            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
1625            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
1626            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
1627            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
1628        .elseif \size == 2
1629            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
1630            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
1631        .elseif \size == 1
1632            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
1633        .else
1634            .error unsupported macroblock size
1635        .endif
1636    .elseif \bpp == 32
1637        .if \size == 8
1638            vld4.8  {d10, d11, d12, d13}, [RGB]!
1639            pld     [RGB, #128]
1640        .elseif \size == 4
1641            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1642            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1643            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1644            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1645        .elseif \size == 2
1646            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1647            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1648        .elseif \size == 1
1649            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1650        .else
1651            .error unsupported macroblock size
1652        .endif
1653    .else
1654        .error unsupported bpp
1655    .endif
1656.endm
1657
1658.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1659
1660/*
1661 * 2 stage pipelined RGB->YCbCr conversion
1662 */
1663
1664.macro do_rgb_to_yuv_stage1
1665    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
1666    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
1667    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
1668    vmull.u16   q7, d4, d0[0]
1669    vmlal.u16   q7, d6, d0[1]
1670    vmlal.u16   q7, d8, d0[2]
1671    vmull.u16   q8, d5, d0[0]
1672    vmlal.u16   q8, d7, d0[1]
1673    vmlal.u16   q8, d9, d0[2]
1674    vrev64.32   q9,  q1
1675    vrev64.32   q13, q1
1676    vmlsl.u16   q9,  d4, d0[3]
1677    vmlsl.u16   q9,  d6, d1[0]
1678    vmlal.u16   q9,  d8, d1[1]
1679    vmlsl.u16   q13, d5, d0[3]
1680    vmlsl.u16   q13, d7, d1[0]
1681    vmlal.u16   q13, d9, d1[1]
1682    vrev64.32   q14, q1
1683    vrev64.32   q15, q1
1684    vmlal.u16   q14, d4, d1[1]
1685    vmlsl.u16   q14, d6, d1[2]
1686    vmlsl.u16   q14, d8, d1[3]
1687    vmlal.u16   q15, d5, d1[1]
1688    vmlsl.u16   q15, d7, d1[2]
1689    vmlsl.u16   q15, d9, d1[3]
1690.endm
1691
1692.macro do_rgb_to_yuv_stage2
1693    vrshrn.u32  d20, q7,  #16
1694    vrshrn.u32  d21, q8,  #16
1695    vshrn.u32   d22, q9,  #16
1696    vshrn.u32   d23, q13, #16
1697    vshrn.u32   d24, q14, #16
1698    vshrn.u32   d25, q15, #16
1699    vmovn.u16   d20, q10      /* d20 = y */
1700    vmovn.u16   d21, q11      /* d21 = u */
1701    vmovn.u16   d22, q12      /* d22 = v */
1702.endm
1703
1704.macro do_rgb_to_yuv
1705    do_rgb_to_yuv_stage1
1706    do_rgb_to_yuv_stage2
1707.endm
1708
1709.macro do_rgb_to_yuv_stage2_store_load_stage1
1710      vrshrn.u32  d20, q7,  #16
1711      vrshrn.u32  d21, q8,  #16
1712      vshrn.u32   d22, q9,  #16
1713    vrev64.32   q9,  q1
1714      vshrn.u32   d23, q13, #16
1715    vrev64.32   q13, q1
1716      vshrn.u32   d24, q14, #16
1717      vshrn.u32   d25, q15, #16
1718    do_load     \bpp, 8
1719      vmovn.u16   d20, q10      /* d20 = y */
1720    vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
1721      vmovn.u16   d21, q11      /* d21 = u */
1722    vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
1723      vmovn.u16   d22, q12      /* d22 = v */
1724    vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
1725    vmull.u16   q7, d4, d0[0]
1726    vmlal.u16   q7, d6, d0[1]
1727    vmlal.u16   q7, d8, d0[2]
1728      vst1.8      {d20}, [Y]!
1729    vmull.u16   q8, d5, d0[0]
1730    vmlal.u16   q8, d7, d0[1]
1731    vmlal.u16   q8, d9, d0[2]
1732    vmlsl.u16   q9,  d4, d0[3]
1733    vmlsl.u16   q9,  d6, d1[0]
1734    vmlal.u16   q9,  d8, d1[1]
1735      vst1.8      {d21}, [U]!
1736    vmlsl.u16   q13, d5, d0[3]
1737    vmlsl.u16   q13, d7, d1[0]
1738    vmlal.u16   q13, d9, d1[1]
1739    vrev64.32   q14, q1
1740    vrev64.32   q15, q1
1741    vmlal.u16   q14, d4, d1[1]
1742    vmlsl.u16   q14, d6, d1[2]
1743    vmlsl.u16   q14, d8, d1[3]
1744      vst1.8      {d22}, [V]!
1745    vmlal.u16   q15, d5, d1[1]
1746    vmlsl.u16   q15, d7, d1[2]
1747    vmlsl.u16   q15, d9, d1[3]
1748.endm
1749
1750.balign 16
1751jsimd_\colorid\()_ycc_neon_consts:
1752    .short          19595, 38470, 7471,  11059
1753    .short          21709, 32768, 27439, 5329
1754    .short          32767, 128,   32767, 128
1755    .short          32767, 128,   32767, 128
1756
1757asm_function jsimd_\colorid\()_ycc_convert_neon
1758    OUTPUT_WIDTH    .req r0
1759    INPUT_BUF       .req r1
1760    OUTPUT_BUF      .req r2
1761    OUTPUT_ROW      .req r3
1762    NUM_ROWS        .req r4
1763
1764    OUTPUT_BUF0     .req r5
1765    OUTPUT_BUF1     .req r6
1766    OUTPUT_BUF2     .req OUTPUT_BUF
1767
1768    RGB             .req r7
1769    Y               .req r8
1770    U               .req r9
1771    V               .req r10
1772    N               .req ip
1773
1774    /* Load constants to d0, d1, d2, d3 */
1775    adr             ip, jsimd_\colorid\()_ycc_neon_consts
1776    vld1.16         {d0, d1, d2, d3}, [ip, :128]
1777
1778    /* Save ARM registers and handle input arguments */
1779    push            {r4, r5, r6, r7, r8, r9, r10, lr}
1780    ldr             NUM_ROWS, [sp, #(4 * 8)]
1781    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
1782    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
1783    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
1784    .unreq          OUTPUT_BUF
1785
1786    /* Save NEON registers */
1787    vpush           {d8-d15}
1788
1789    /* Outer loop over scanlines */
1790    cmp             NUM_ROWS, #1
1791    blt             9f
17920:
1793    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
1794    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1795    mov             N, OUTPUT_WIDTH
1796    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1797    add             OUTPUT_ROW, OUTPUT_ROW, #1
1798    ldr             RGB, [INPUT_BUF], #4
1799
1800    /* Inner loop over pixels */
1801    subs            N, N, #8
1802    blt             3f
1803    do_load         \bpp, 8
1804    do_rgb_to_yuv_stage1
1805    subs            N, N, #8
1806    blt             2f
18071:
1808    do_rgb_to_yuv_stage2_store_load_stage1
1809    subs            N, N, #8
1810    bge             1b
18112:
1812    do_rgb_to_yuv_stage2
1813    do_store        8
1814    tst             N, #7
1815    beq             8f
18163:
1817    tst             N, #4
1818    beq             3f
1819    do_load         \bpp, 4
18203:
1821    tst             N, #2
1822    beq             4f
1823    do_load         \bpp, 2
18244:
1825    tst             N, #1
1826    beq             5f
1827    do_load         \bpp, 1
18285:
1829    do_rgb_to_yuv
1830    tst             N, #4
1831    beq             6f
1832    do_store        4
18336:
1834    tst             N, #2
1835    beq             7f
1836    do_store        2
18377:
1838    tst             N, #1
1839    beq             8f
1840    do_store        1
18418:
1842    subs            NUM_ROWS, NUM_ROWS, #1
1843    bgt             0b
18449:
1845    /* Restore all registers and return */
1846    vpop            {d8-d15}
1847    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
1848
1849    .unreq          OUTPUT_WIDTH
1850    .unreq          OUTPUT_ROW
1851    .unreq          INPUT_BUF
1852    .unreq          NUM_ROWS
1853    .unreq          OUTPUT_BUF0
1854    .unreq          OUTPUT_BUF1
1855    .unreq          OUTPUT_BUF2
1856    .unreq          RGB
1857    .unreq          Y
1858    .unreq          U
1859    .unreq          V
1860    .unreq          N
1861.endfunc
1862
1863.purgem do_rgb_to_yuv
1864.purgem do_rgb_to_yuv_stage1
1865.purgem do_rgb_to_yuv_stage2
1866.purgem do_rgb_to_yuv_stage2_store_load_stage1
1867
1868.endm
1869
1870/*--------------------------------- id ----- bpp R  G  B */
1871generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
1872generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
1873generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1874generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1875generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1876generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1877
1878.purgem do_load
1879.purgem do_store
1880
1881
1882/*****************************************************************************/
1883
1884/*
1885 * Load data into workspace, applying unsigned->signed conversion
1886 *
1887 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
1888 *       rid of VST1.16 instructions
1889 */
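/*
 * For reference (editorial note, not part of the original source), the
 * scalar equivalent of this function is simply, for each row i and column j
 * of the 8x8 block (lowercase names correspond to the register aliases
 * below):
 *
 *     workspace[i * 8 + j] = (DCTELEM) sample_data[i][start_col + j] - CENTERJSAMPLE;
 *
 * i.e. each 8-bit sample is widened to 16 bits (vsubl.u8) and re-centered
 * around zero by subtracting 128.
 */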
1890
1891asm_function jsimd_convsamp_neon
1892    SAMPLE_DATA     .req r0
1893    START_COL       .req r1
1894    WORKSPACE       .req r2
1895    TMP1            .req r3
1896    TMP2            .req r4
1897    TMP3            .req r5
1898    TMP4            .req ip
1899
1900    push            {r4, r5}
1901    vmov.u8         d0, #128
1902
1903    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1904    add             TMP1, TMP1, START_COL
1905    add             TMP2, TMP2, START_COL
1906    add             TMP3, TMP3, START_COL
1907    add             TMP4, TMP4, START_COL
1908    vld1.8          {d16}, [TMP1]
1909    vsubl.u8        q8, d16, d0
1910    vld1.8          {d18}, [TMP2]
1911    vsubl.u8        q9, d18, d0
1912    vld1.8          {d20}, [TMP3]
1913    vsubl.u8        q10, d20, d0
1914    vld1.8          {d22}, [TMP4]
1915    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1916    vsubl.u8        q11, d22, d0
1917    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
1918    add             TMP1, TMP1, START_COL
1919    add             TMP2, TMP2, START_COL
1920    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
1921    add             TMP3, TMP3, START_COL
1922    add             TMP4, TMP4, START_COL
1923    vld1.8          {d24}, [TMP1]
1924    vsubl.u8        q12, d24, d0
1925    vld1.8          {d26}, [TMP2]
1926    vsubl.u8        q13, d26, d0
1927    vld1.8          {d28}, [TMP3]
1928    vsubl.u8        q14, d28, d0
1929    vld1.8          {d30}, [TMP4]
1930    vsubl.u8        q15, d30, d0
1931    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
1932    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
1933    pop             {r4, r5}
1934    bx              lr
1935
1936    .unreq          SAMPLE_DATA
1937    .unreq          START_COL
1938    .unreq          WORKSPACE
1939    .unreq          TMP1
1940    .unreq          TMP2
1941    .unreq          TMP3
1942    .unreq          TMP4
1943.endfunc
1944
1945
1946/*****************************************************************************/
1947
1948/*
1949 * jsimd_fdct_ifast_neon
1950 *
1951 * This function contains a fast, but less accurate, integer implementation of
1952 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
1953 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
1954 * function from jfdctfst.c
1955 *
1956 * TODO: can be combined with 'jsimd_convsamp_neon' to get
1957 *       rid of a bunch of VLD1.16 instructions
1958 */
1959
1960#define XFIX_0_382683433 d0[0]
1961#define XFIX_0_541196100 d0[1]
1962#define XFIX_0_707106781 d0[2]
1963#define XFIX_1_306562965 d0[3]
1964
1965.balign 16
1966jsimd_fdct_ifast_neon_consts:
1967    .short (98 * 128)              /* XFIX_0_382683433 */
1968    .short (139 * 128)             /* XFIX_0_541196100 */
1969    .short (181 * 128)             /* XFIX_0_707106781 */
1970    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
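
/*
 * Editorial note (an addition, not part of the original source): these
 * constants are signed Q15 fractions -- each value is the corresponding
 * jfdctfst.c coefficient rounded to 8 fractional bits and then scaled by
 * 128, e.g. 98 * 128 = 12544 ~= 0.382683 * 32768.  They are applied with
 * VQDMULH.S16, which computes (a * b * 2) >> 16 with saturation, i.e.
 * roughly:
 *
 *     y = (int16_t) (((int32_t) x * 12544) >> 15);   // x * ~0.3827 (98/256)
 *
 * XFIX_1_306562965 is stored minus 1.0 (334 * 128 - 256 * 128), because a
 * value >= 1.0 does not fit in Q15; the code adds the input back in
 * separately to make up the difference.
 */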
1971
1972asm_function jsimd_fdct_ifast_neon
1973
1974    DATA            .req r0
1975    TMP             .req ip
1976
1977    vpush           {d8-d15}
1978
1979    /* Load constants */
1980    adr             TMP, jsimd_fdct_ifast_neon_consts
1981    vld1.16         {d0}, [TMP, :64]
1982
1983    /* Load all DATA into NEON registers with the following allocation:
1984     *       0 1 2 3 | 4 5 6 7
1985     *      ---------+--------
1986     *   0 | d16     | d17    | q8
1987     *   1 | d18     | d19    | q9
1988     *   2 | d20     | d21    | q10
1989     *   3 | d22     | d23    | q11
1990     *   4 | d24     | d25    | q12
1991     *   5 | d26     | d27    | q13
1992     *   6 | d28     | d29    | q14
1993     *   7 | d30     | d31    | q15
1994     */
1995
1996    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
1997    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
1998    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
1999    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
2000    sub             DATA, DATA, #(128 - 32)
2001
2002    mov             TMP, #2
20031:
2004    /* Transpose */
2005    vtrn.16         q12, q13
2006    vtrn.16         q10, q11
2007    vtrn.16         q8,  q9
2008    vtrn.16         q14, q15
2009    vtrn.32         q9,  q11
2010    vtrn.32         q13, q15
2011    vtrn.32         q8,  q10
2012    vtrn.32         q12, q14
2013    vswp            d30, d23
2014    vswp            d24, d17
2015    vswp            d26, d19
2016      /* 1-D FDCT */
2017      vadd.s16        q2,  q11, q12
2018    vswp            d28, d21
2019      vsub.s16        q12, q11, q12
2020      vsub.s16        q6,  q10, q13
2021      vadd.s16        q10, q10, q13
2022      vsub.s16        q7,  q9,  q14
2023      vadd.s16        q9,  q9,  q14
2024      vsub.s16        q1,  q8,  q15
2025      vadd.s16        q8,  q8,  q15
2026      vsub.s16        q4,  q9,  q10
2027      vsub.s16        q5,  q8,  q2
2028      vadd.s16        q3,  q9,  q10
2029      vadd.s16        q4,  q4,  q5
2030      vadd.s16        q2,  q8,  q2
2031      vqdmulh.s16     q4,  q4,  XFIX_0_707106781
2032      vadd.s16        q11, q12, q6
2033      vadd.s16        q8,  q2,  q3
2034      vsub.s16        q12, q2,  q3
2035      vadd.s16        q3,  q6,  q7
2036      vadd.s16        q7,  q7,  q1
2037      vqdmulh.s16     q3,  q3,  XFIX_0_707106781
2038      vsub.s16        q6,  q11, q7
2039      vadd.s16        q10, q5,  q4
2040      vqdmulh.s16     q6,  q6,  XFIX_0_382683433
2041      vsub.s16        q14, q5,  q4
2042      vqdmulh.s16     q11, q11, XFIX_0_541196100
2043      vqdmulh.s16     q5,  q7,  XFIX_1_306562965
2044      vadd.s16        q4,  q1,  q3
2045      vsub.s16        q3,  q1,  q3
2046      vadd.s16        q7,  q7,  q6
2047      vadd.s16        q11, q11, q6
2048      vadd.s16        q7,  q7,  q5
2049      vadd.s16        q13, q3,  q11
2050      vsub.s16        q11, q3,  q11
2051      vadd.s16        q9,  q4,  q7
2052      vsub.s16        q15, q4,  q7
2053    subs            TMP, TMP, #1
2054    bne             1b
2055
2056    /* store results */
2057    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
2058    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
2059    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
2060    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
2061
2062    vpop            {d8-d15}
2063    bx              lr
2064
2065    .unreq          DATA
2066    .unreq          TMP
2067.endfunc
2068
2069
2070/*****************************************************************************/
2071
2072/*
2073 * GLOBAL(void)
2074 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
2075 *                      DCTELEM * workspace);
2076 *
2077 * Note: the code uses 2 stage pipelining in order to improve instruction
2078 *       scheduling and eliminate stalls (this provides ~15% better
2079 *       performance for this function on both ARM Cortex-A8 and
2080 *       ARM Cortex-A9 when compared to the non-pipelined variant).
2081 *       The instructions which belong to the second stage use different
2082 *       indentation for better readability.
2083 */
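/*
 * For reference, a scalar sketch (editorial addition, not part of the
 * original source) of what the pipelined loop below computes for each of
 * the 64 coefficients.  Indexing the divisors table in 16-bit elements, the
 * reciprocals live at offset 0, the corrections at offset 64 and the shift
 * counts at offset 192, matching the address arithmetic below:
 *
 *     sign = workspace[i] >> 15;                                // 0 or -1
 *     val  = abs(workspace[i]) + divisors[64 + i];              // add correction
 *     val  = (uint16_t) (((uint32_t) val * divisors[i]) >> 16); // multiply by reciprocal
 *     val >>= divisors[192 + i];                                // final scaling shift
 *     coef_block[i] = (val ^ sign) - sign;                      // restore sign
 */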
2084asm_function jsimd_quantize_neon
2085
2086    COEF_BLOCK      .req r0
2087    DIVISORS        .req r1
2088    WORKSPACE       .req r2
2089
2090    RECIPROCAL      .req DIVISORS
2091    CORRECTION      .req r3
2092    SHIFT           .req ip
2093    LOOP_COUNT      .req r4
2094
2095    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
2096    vabs.s16        q12, q0
2097    add             CORRECTION, DIVISORS, #(64 * 2)
2098    add             SHIFT, DIVISORS, #(64 * 6)
2099    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
2100    vabs.s16        q13, q1
2101    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2102    vadd.u16        q12, q12, q10 /* add correction */
2103    vadd.u16        q13, q13, q11
2104    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
2105    vmull.u16       q11, d25, d17
2106    vmull.u16       q8,  d26, d18
2107    vmull.u16       q9,  d27, d19
2108    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
2109    vshrn.u32       d20, q10, #16
2110    vshrn.u32       d21, q11, #16
2111    vshrn.u32       d22, q8,  #16
2112    vshrn.u32       d23, q9,  #16
2113    vneg.s16        q12, q12
2114    vneg.s16        q13, q13
2115    vshr.s16        q2,  q0,  #15 /* extract sign */
2116    vshr.s16        q3,  q1,  #15
2117    vshl.u16        q14, q10, q12 /* shift */
2118    vshl.u16        q15, q11, q13
2119
2120    push            {r4, r5}
2121    mov             LOOP_COUNT, #3
21221:
2123    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
2124      veor.u16        q14, q14, q2  /* restore sign */
2125    vabs.s16        q12, q0
2126    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
2127    vabs.s16        q13, q1
2128      veor.u16        q15, q15, q3
2129    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2130    vadd.u16        q12, q12, q10 /* add correction */
2131    vadd.u16        q13, q13, q11
2132    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
2133    vmull.u16       q11, d25, d17
2134    vmull.u16       q8,  d26, d18
2135    vmull.u16       q9,  d27, d19
2136      vsub.u16        q14, q14, q2
2137    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
2138      vsub.u16        q15, q15, q3
2139    vshrn.u32       d20, q10, #16
2140    vshrn.u32       d21, q11, #16
2141      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2142    vshrn.u32       d22, q8,  #16
2143    vshrn.u32       d23, q9,  #16
2144    vneg.s16        q12, q12
2145    vneg.s16        q13, q13
2146    vshr.s16        q2,  q0,  #15 /* extract sign */
2147    vshr.s16        q3,  q1,  #15
2148    vshl.u16        q14, q10, q12 /* shift */
2149    vshl.u16        q15, q11, q13
2150    subs            LOOP_COUNT, LOOP_COUNT, #1
2151    bne             1b
2152    pop             {r4, r5}
2153
2154      veor.u16        q14, q14, q2  /* restore sign */
2155      veor.u16        q15, q15, q3
2156      vsub.u16        q14, q14, q2
2157      vsub.u16        q15, q15, q3
2158      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2159
2160    bx              lr /* return */
2161
2162    .unreq          COEF_BLOCK
2163    .unreq          DIVISORS
2164    .unreq          WORKSPACE
2165    .unreq          RECIPROCAL
2166    .unreq          CORRECTION
2167    .unreq          SHIFT
2168    .unreq          LOOP_COUNT
2169.endfunc
2170
2171
2172/*****************************************************************************/
2173
2174/*
2175 * GLOBAL(void)
2176 * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
2177 *                                 JDIMENSION   downsampled_width,
2178 *                                 JSAMPARRAY   input_data,
2179 *                                 JSAMPARRAY * output_data_ptr);
2180 *
2181 * Note: the use of unaligned writes is the main remaining bottleneck in
2182 *       this code; addressing it could potentially yield up to tens of
2183 *       percent better performance on Cortex-A8/Cortex-A9.
2184 */
2185
2186/*
2187 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
2188 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
2189 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
2190 * Register d28 is used for multiplication by 3. Register q15 is used
2191 * for adding +1 bias.
2192 */
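/*
 * For reference, a scalar sketch (editorial addition, not part of the
 * original source) of the general case that the macros below vectorize; the
 * first and last output pixels of each row are special-cased in
 * 'upsample_row':
 *
 *     out[2 * i]     = (3 * in[i] + in[i - 1] + 1) >> 2;
 *     out[2 * i + 1] = (3 * in[i] + in[i + 1] + 2) >> 2;
 */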
2193.macro upsample16   OUTPTR, INPTR
2194    vld1.8          {q0}, [\INPTR]!
2195    vmovl.u8        q8,  d0
2196    vext.8          q2,  q1,  q0, #15
2197    vmovl.u8        q9,  d1
2198    vaddw.u8        q10, q15, d4
2199    vaddw.u8        q11, q15, d5
2200    vmlal.u8        q8,  d4,  d28
2201    vmlal.u8        q9,  d5,  d28
2202    vmlal.u8        q10, d0,  d28
2203    vmlal.u8        q11, d1,  d28
2204    vmov            q1,  q0       /* backup source pixels to q1 */
2205    vrshrn.u16      d6,  q8,  #2
2206    vrshrn.u16      d7,  q9,  #2
2207    vshrn.u16       d8,  q10, #2
2208    vshrn.u16       d9,  q11, #2
2209    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2210.endm
2211
2212/*
2213 * Upsample 32 source pixels to 64 destination pixels. Compared to the
2214 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
2215 * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
2216 * instruction is not needed. This unrolling also allows loads and stores to
2217 * be reordered to hide multiplication latency and reduce stalls.
2218 */
2219.macro upsample32   OUTPTR, INPTR
2220    /* even 16 pixels group */
2221    vld1.8          {q0}, [\INPTR]!
2222    vmovl.u8        q8,  d0
2223    vext.8          q2,  q1,  q0, #15
2224    vmovl.u8        q9,  d1
2225    vaddw.u8        q10, q15, d4
2226    vaddw.u8        q11, q15, d5
2227    vmlal.u8        q8,  d4,  d28
2228    vmlal.u8        q9,  d5,  d28
2229    vmlal.u8        q10, d0,  d28
2230    vmlal.u8        q11, d1,  d28
2231        /* odd 16 pixels group */
2232        vld1.8          {q1}, [\INPTR]!
2233    vrshrn.u16      d6,  q8,  #2
2234    vrshrn.u16      d7,  q9,  #2
2235    vshrn.u16       d8,  q10, #2
2236    vshrn.u16       d9,  q11, #2
2237        vmovl.u8        q8,  d2
2238        vext.8          q2,  q0,  q1, #15
2239        vmovl.u8        q9,  d3
2240        vaddw.u8        q10, q15, d4
2241        vaddw.u8        q11, q15, d5
2242        vmlal.u8        q8,  d4,  d28
2243        vmlal.u8        q9,  d5,  d28
2244        vmlal.u8        q10, d2,  d28
2245        vmlal.u8        q11, d3,  d28
2246    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2247        vrshrn.u16      d6,  q8,  #2
2248        vrshrn.u16      d7,  q9,  #2
2249        vshrn.u16       d8,  q10, #2
2250        vshrn.u16       d9,  q11, #2
2251        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2252.endm
2253
2254/*
2255 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
2256 */
2257.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
2258    /* special case for the first and last pixels */
2259    sub             \WIDTH, \WIDTH, #1
2260    add             \OUTPTR, \OUTPTR, #1
2261    ldrb            \TMP1, [\INPTR, \WIDTH]
2262    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
2263    ldrb            \TMP1, [\INPTR], #1
2264    strb            \TMP1, [\OUTPTR, #-1]
2265    vmov.8          d3[7], \TMP1
2266
2267    subs            \WIDTH, \WIDTH, #32
2268    blt             5f
22690:  /* process 32 pixels per iteration */
2270    upsample32      \OUTPTR, \INPTR
2271    subs            \WIDTH, \WIDTH, #32
2272    bge             0b
22735:
2274    adds            \WIDTH, \WIDTH, #16
2275    blt             1f
22760:  /* process 16 pixels if needed */
2277    upsample16      \OUTPTR, \INPTR
2278    subs            \WIDTH, \WIDTH, #16
22791:
2280    adds            \WIDTH, \WIDTH, #16
2281    beq             9f
2282
2283    /* load the remaining 1-15 pixels */
2284    add             \INPTR, \INPTR, \WIDTH
2285    tst             \WIDTH, #1
2286    beq             2f
2287    sub             \INPTR, \INPTR, #1
2288    vld1.8          {d0[0]}, [\INPTR]
22892:
2290    tst             \WIDTH, #2
2291    beq             2f
2292    vext.8          d0, d0, d0, #6
2293    sub             \INPTR, \INPTR, #1
2294    vld1.8          {d0[1]}, [\INPTR]
2295    sub             \INPTR, \INPTR, #1
2296    vld1.8          {d0[0]}, [\INPTR]
22972:
2298    tst             \WIDTH, #4
2299    beq             2f
2300    vrev64.32       d0, d0
2301    sub             \INPTR, \INPTR, #1
2302    vld1.8          {d0[3]}, [\INPTR]
2303    sub             \INPTR, \INPTR, #1
2304    vld1.8          {d0[2]}, [\INPTR]
2305    sub             \INPTR, \INPTR, #1
2306    vld1.8          {d0[1]}, [\INPTR]
2307    sub             \INPTR, \INPTR, #1
2308    vld1.8          {d0[0]}, [\INPTR]
23092:
2310    tst             \WIDTH, #8
2311    beq             2f
2312    vmov            d1,  d0
2313    sub             \INPTR, \INPTR, #8
2314    vld1.8          {d0}, [\INPTR]
23152:  /* upsample the remaining pixels */
2316    vmovl.u8        q8,  d0
2317    vext.8          q2,  q1,  q0, #15
2318    vmovl.u8        q9,  d1
2319    vaddw.u8        q10, q15, d4
2320    vaddw.u8        q11, q15, d5
2321    vmlal.u8        q8,  d4,  d28
2322    vmlal.u8        q9,  d5,  d28
2323    vmlal.u8        q10, d0,  d28
2324    vmlal.u8        q11, d1,  d28
2325    vrshrn.u16      d10, q8,  #2
2326    vrshrn.u16      d12, q9,  #2
2327    vshrn.u16       d11, q10, #2
2328    vshrn.u16       d13, q11, #2
2329    vzip.8          d10, d11
2330    vzip.8          d12, d13
2331    /* store the remaining pixels */
2332    tst             \WIDTH, #8
2333    beq             2f
2334    vst1.8          {d10, d11}, [\OUTPTR]!
2335    vmov            q5,  q6
23362:
2337    tst             \WIDTH, #4
2338    beq             2f
2339    vst1.8          {d10}, [\OUTPTR]!
2340    vmov            d10,  d11
23412:
2342    tst             \WIDTH, #2
2343    beq             2f
2344    vst1.8          {d10[0]}, [\OUTPTR]!
2345    vst1.8          {d10[1]}, [\OUTPTR]!
2346    vst1.8          {d10[2]}, [\OUTPTR]!
2347    vst1.8          {d10[3]}, [\OUTPTR]!
2348    vext.8          d10, d10, d10, #4
23492:
2350    tst             \WIDTH, #1
2351    beq             2f
2352    vst1.8          {d10[0]}, [\OUTPTR]!
2353    vst1.8          {d10[1]}, [\OUTPTR]!
23542:
23559:
2356.endm
2357
2358asm_function jsimd_h2v1_fancy_upsample_neon
2359
2360    MAX_V_SAMP_FACTOR .req r0
2361    DOWNSAMPLED_WIDTH .req r1
2362    INPUT_DATA        .req r2
2363    OUTPUT_DATA_PTR   .req r3
2364    OUTPUT_DATA       .req OUTPUT_DATA_PTR
2365
2366    OUTPTR            .req r4
2367    INPTR             .req r5
2368    WIDTH             .req ip
2369    TMP               .req lr
2370
2371    push            {r4, r5, r6, lr}
2372    vpush           {d8-d15}
2373
2374    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
2375    cmp             MAX_V_SAMP_FACTOR, #0
2376    ble             99f
2377
2378    /* initialize constants */
2379    vmov.u8         d28, #3
2380    vmov.u16        q15, #1
238111:
2382    ldr             INPTR, [INPUT_DATA], #4
2383    ldr             OUTPTR, [OUTPUT_DATA], #4
2384    mov             WIDTH, DOWNSAMPLED_WIDTH
2385    upsample_row    OUTPTR, INPTR, WIDTH, TMP
2386    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
2387    bgt             11b
2388
238999:
2390    vpop            {d8-d15}
2391    pop             {r4, r5, r6, pc}
2392
2393    .unreq          MAX_V_SAMP_FACTOR
2394    .unreq          DOWNSAMPLED_WIDTH
2395    .unreq          INPUT_DATA
2396    .unreq          OUTPUT_DATA_PTR
2397    .unreq          OUTPUT_DATA
2398
2399    .unreq          OUTPTR
2400    .unreq          INPTR
2401    .unreq          WIDTH
2402    .unreq          TMP
2403
2404.endfunc
2405
2406.purgem upsample16
2407.purgem upsample32
2408.purgem upsample_row
2409