/*
 * ARMv7 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm
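
/* For reference, the two vtrn.16 + two vtrn.32 steps above amount to a full
 * 4x4 transpose.  A worked example (the values are arbitrary and shown only
 * as an illustration):
 *
 *   x0 = [a0 a1 a2 a3]  vtrn.16 x0,x1  [a0 b0 a2 b2]  vtrn.32 x0,x2  [a0 b0 c0 d0]
 *   x1 = [b0 b1 b2 b3]  ------------>  [a1 b1 a3 b3]  ------------>  [a1 b1 c1 d1]
 *   x2 = [c0 c1 c2 c3]  vtrn.16 x2,x3  [c0 d0 c2 d2]  vtrn.32 x1,x3  [a2 b2 c2 d2]
 *   x3 = [d0 d1 d2 d3]                 [c1 d1 c3 d3]                 [a3 b3 c3 d3]
 */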


#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)

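/* The FIX_* values above are the standard libjpeg fixed-point encoding of the
 * ISLOW iDCT constants: each real constant is scaled by 2^13 (CONST_BITS == 13
 * in jidctint.c) and rounded to the nearest integer.  A minimal C sketch of
 * that derivation (illustrative only; FIX() is a local helper here, not
 * something defined in this file):
 *
 *     #define CONST_BITS 13
 *     #define FIX(x)  ((INT32) ((x) * (1 << CONST_BITS) + 0.5))
 *
 *     FIX(0.541196100)  ==  4433    //  0.541196100 * 8192 =  4433.48...
 *     FIX(1.847759065)  == 15137    //  1.847759065 * 8192 = 15136.84...
 */
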
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
{                                                                             \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
                                                                              \
    /* 1-D iDCT input data */                                                 \
    row0 = xrow0;                                                             \
    row1 = xrow1;                                                             \
    row2 = xrow2;                                                             \
    row3 = xrow3;                                                             \
    row4 = xrow4;                                                             \
    row5 = xrow5;                                                             \
    row6 = xrow6;                                                             \
    row7 = xrow7;                                                             \
                                                                              \
    q5 = row7 + row3;                                                         \
    q4 = row5 + row1;                                                         \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
         MULTIPLY(q4, FIX_1_175875602);                                       \
    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
    q4 = q6;                                                                  \
    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
    /* now we can use q1 (reloadable constants have been used up) */          \
    q1 = q3 + q2;                                                             \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
          MULTIPLY(row1, -FIX_0_899976223);                                   \
    q5 = q7;                                                                  \
    q1 = q1 + q6;                                                             \
    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
                                                                              \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
    tmp11_plus_tmp2 = q1;                                                     \
    row1 = 0;                                                                 \
                                                                              \
    q1 = q1 - q6;                                                             \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
          MULTIPLY(row3, -FIX_2_562915447);                                   \
    q1 = q1 - q6;                                                             \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
         MULTIPLY(row6, FIX_0_541196100);                                     \
    q3 = q3 - q2;                                                             \
                                                                              \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
    tmp11_minus_tmp2 = q1;                                                    \
                                                                              \
    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
    q2 = q1 + q6;                                                             \
    q1 = q1 - q6;                                                             \
                                                                              \
    /* pick up the results */                                                 \
    tmp0  = q4;                                                               \
    tmp1  = q5;                                                               \
    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
    tmp3  = q7;                                                               \
    tmp10 = q2;                                                               \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
    tmp12 = q3;                                                               \
    tmp13 = q1;                                                               \
}
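
/* A note on how the intermediate scaling maps onto the shifts used in the
 * NEON code below (this mirrors the DESCALE() steps of jidctint.c and is
 * given only as a reading aid): pass 1 results are descaled by
 * CONST_BITS - PASS1_BITS = 13 - 2 = 11 (the 'vrshrn #11' narrowing shifts),
 * pass 2 keeps the top 16 bits of the 32-bit accumulators ('vshrn #16'), and
 * the final 'vqrshrn #2' in the epilogue removes the remaining PASS1_BITS,
 * i.e. the total pass-2 descale is 16 + 2 = CONST_BITS + PASS1_BITS + 3 = 18.
 */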

#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    ROW0L           .req d16
    ROW0R           .req d17
    ROW1L           .req d18
    ROW1R           .req d19
    ROW2L           .req d20
    ROW2R           .req d21
    ROW3L           .req d22
    ROW3R           .req d23
    ROW4L           .req d24
    ROW4R           .req d25
    ROW5L           .req d26
    ROW5R           .req d27
    ROW6L           .req d28
    ROW6R           .req d29
    ROW7L           .req d30
    ROW7R           .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
    add             ip, ip, #16
    vmul.s16        q15, q15, q3
    vpush           {d8-d15} /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4,    ROW7L, ROW3L
    vadd.s16        d5,    ROW5L, ROW1L
    vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6,    d5,    XFIX_1_175875602
    vmull.s16       q7,    d4,    XFIX_1_175875602
      /* Check for the zero coefficients in the right 4x8 half */
      push            {r4, r5}
    vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3,    ROW0L, ROW4L
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2,    ROW2L, XFIX_0_541196100
    vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
      orr             r0,    r4,    r5
    vmov            q4,    q6
    vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3,    q3,    #13
      orr             r0,    r0,    r4
    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
      orr             r0,    r0,    r5
    vadd.s32        q1,    q3,    q2
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov            q5,    q7
    vadd.s32        q1,    q1,    q6
      orr             r0,    r0,    r4
    vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
      orr             r0,    r0,    r5
    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1,    #11
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32        q1,    q1,    q6
    vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
      orr             r0,    r0,    r4
    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
      orr             r0,    r0,    r5
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6,    ROW6L, XFIX_0_541196100
    vsub.s32        q3,    q3,    q2
      orr             r0,    r0,    r4
    vrshrn.s32      ROW6L, q1,    #11
      orr             r0,    r0,    r5
    vadd.s32        q1,    q3,    q5
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32        q3,    q3,    q5
    vaddl.s16       q5,    ROW0L, ROW4L
      orr             r0,    r0,    r4
    vrshrn.s32      ROW2L, q1,    #11
      orr             r0,    r0,    r5
    vrshrn.s32      ROW5L, q3,    #11
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32        q5,    q5,    #13
    vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
      orr             r0,    r0,    r4
    vadd.s32        q2,    q5,    q6
      orrs            r0,    r0,    r5
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
      orr             r0,    r4,    r5
    vsub.s32        q3,    q1,    q4
      pop             {r4, r5}
    vrshrn.s32      ROW7L, q2,    #11
    vrshrn.s32      ROW3L, q5,    #11
    vrshrn.s32      ROW0L, q6,    #11
    vrshrn.s32      ROW4L, q3,    #11

      beq             3f /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vadd.s16        d10,   ROW7R, ROW3R
    vadd.s16        d8,    ROW5R, ROW1R
      /* Transpose left 4x8 half */
      vtrn.16         ROW6L, ROW7L
    vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6,    d8,    XFIX_1_175875602
      vtrn.16         ROW2L, ROW3L
    vmull.s16       q7,    d10,   XFIX_1_175875602
    vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
      vtrn.16         ROW0L, ROW1L
    vsubl.s16       q3,    ROW0R, ROW4R
    vmull.s16       q2,    ROW2R, XFIX_0_541196100
    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
      vtrn.16         ROW4L, ROW5L
    vmov            q4,    q6
    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
    vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
      vtrn.32         ROW1L, ROW3L
    vshl.s32        q3,    q3,    #13
    vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
      vtrn.32         ROW4L, ROW6L
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vadd.s32        q1,    q1,    q6
      vtrn.32         ROW0L, ROW2L
    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
    vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1,    #11
      vtrn.32         ROW5L, ROW7L
    vsub.s32        q1,    q1,    q6
    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
    vsub.s32        q3,    q3,    q2
    vrshrn.s32      ROW6R, q1,    #11
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vaddl.s16       q5,    ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1,    #11
    vrshrn.s32      ROW5R, q3,    #11
    vshl.s32        q5,    q5,    #13
    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vrshrn.s32      ROW7R, q2,    #11
    vrshrn.s32      ROW3R, q5,    #11
    vrshrn.s32      ROW0R, q6,    #11
    vrshrn.s32      ROW4R, q3,    #11
    /* Transpose right 4x8 half */
    vtrn.16         ROW6R, ROW7R
    vtrn.16         ROW2R, ROW3R
    vtrn.16         ROW0R, ROW1R
    vtrn.16         ROW4R, ROW5R
    vtrn.32         ROW1R, ROW3R
    vtrn.32         ROW4R, ROW6R
    vtrn.32         ROW0R, ROW2R
    vtrn.32         ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16       q6,    ROW1L, XFIX_1_175875602
    vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16       q7,    ROW3L, XFIX_1_175875602
    vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vmull.s16       q2,    ROW2L, XFIX_0_541196100
    vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
    vmov            q4,    q6
    vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3,    q3,    #13
    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vadd.s32        q1,    q1,    q6
    vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1,    #16
    vsub.s32        q1,    q1,    q6
    vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vsub.s32        q3,    q3,    q2
    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1,    #16
    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
    vshl.s32        q5,    q5,    #13
    vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5,    #16
    vshrn.s32       ROW0L, q6,    #16
    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vmull.s16       q6,    ROW5R, XFIX_1_175875602
    vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmull.s16       q7,    ROW7R, XFIX_1_175875602
    vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov            q4,    q6
    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
    vshl.s32        q3,    q3,    #13
    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vadd.s32        q1,    q1,    q6
    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
    vsub.s32        q1,    q1,    q6
    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
    vsub.s32        q3,    q3,    q2
    vshrn.s32       ROW6R, q1,    #16
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3,    #16
    vshl.s32        q5,    q5,    #13
    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vshrn.s32       ROW7R, q2,    #16
    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3,    #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16,   q8,    #2
    vqrshrn.s16     d17,   q9,    #2
    vqrshrn.s16     d18,   q10,   #2
    vqrshrn.s16     d19,   q11,   #2
    vpop            {d8-d15} /* restore NEON registers */
    vqrshrn.s16     d20,   q12,   #2
      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
      vtrn.16         q8,    q9
    vqrshrn.s16     d21,   q13,   #2
    vqrshrn.s16     d22,   q14,   #2
      vmov.u8         q0,    #(CENTERJSAMPLE)
    vqrshrn.s16     d23,   q15,   #2
      vtrn.8          d16,   d17
      vtrn.8          d18,   d19
      vadd.u8         q8,    q8,    q0
      vadd.u8         q9,    q9,    q0
      vtrn.16         q10,   q11
        /* Store results to the output buffer */
        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        vst1.8          {d16}, [TMP1]
      vtrn.8          d20, d21
        vst1.8          {d17}, [TMP2]
        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        vst1.8          {d18}, [TMP1]
      vadd.u8         q10,   q10,   q0
        vst1.8          {d19}, [TMP2]
        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
        add             TMP1, TMP1, OUTPUT_COL
        add             TMP2, TMP2, OUTPUT_COL
        add             TMP3, TMP3, OUTPUT_COL
        add             TMP4, TMP4, OUTPUT_COL
      vtrn.8          d22, d23
        vst1.8          {d20}, [TMP1]
      vadd.u8         q11,   q11,   q0
        vst1.8          {d21}, [TMP2]
        vst1.8          {d22}, [TMP3]
        vst1.8          {d23}, [TMP4]
    bx              lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vtrn.16         ROW2L, ROW3L
    vtrn.16         ROW0L, ROW1L
    vtrn.16         ROW4L, ROW5L
    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
    vtrn.32         ROW1L, ROW3L
    vtrn.32         ROW4L, ROW6L
    vtrn.32         ROW0L, ROW2L
    vtrn.32         ROW5L, ROW7L

    cmp             r0, #0
    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half  */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vmull.s16       q6,    ROW1L, XFIX_1_175875602
    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7,    ROW3L, XFIX_1_175875602
    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2,    ROW2L, XFIX_0_541196100
    vshll.s16       q3,    ROW0L, #13
    vmov            q4,    q6
    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1,    q1,    q6
    vadd.s32        q6,    q6,    q6
    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1,    #16
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3,    q3,    q2
    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vshll.s16       q5,    ROW0L, #13
    vshrn.s32       ROW2L, q1,    #16
    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5,    #16
    vshrn.s32       ROW0L, q6,    #16
    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2},  [ip, :64]    /* reload constants */
    vmull.s16       q6,    ROW5L, XFIX_1_175875602
    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7,    ROW7L, XFIX_1_175875602
    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2,    ROW6L, XFIX_0_541196100
    vshll.s16       q3,    ROW4L, #13
    vmov            q4,    q6
    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
    vadd.s32        q1,    q3,    q2
    vmov            q5,    q7
    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1,    q1,    q6
    vadd.s32        q6,    q6,    q6
    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
    vsub.s32        q1,    q1,    q6
    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3,    q3,    q2
    vshrn.s32       ROW6R, q1,    #16
    vadd.s32        q1,    q3,    q5
    vsub.s32        q3,    q3,    q5
    vshll.s16       q5,    ROW4L, #13
    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3,    #16
    vadd.s32        q2,    q5,    q6
    vsub.s32        q1,    q5,    q6
    vadd.s32        q6,    q2,    q7
    vsub.s32        q2,    q2,    q7
    vadd.s32        q5,    q1,    q4
    vsub.s32        q3,    q1,    q4
    vshrn.s32       ROW7R, q2,    #16
    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3,    #16
    b               2b /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but less accurate, integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */

#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
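
/* A worked example of the trick described above (illustrative only):
 * jidctfst.c computes MULTIPLY(x, FIX_1_082392200) as roughly (x * 277) / 256.
 * VQDMULH returns (2 * a * b) >> 16, so with the constant stored above,
 * (277 - 256) * 128 = 2688, we get
 *
 *     x + VQDMULH(x, 2688)  =  x + ((x * 2688 * 2) >> 16)
 *                           ~  x * (1 + 21/256)  =  x * 277/256
 *
 * which is the "x * 0.082392200 + x" form of x * 1.082392200 with one extra
 * addition.
 */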

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8,  q8,  q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9,  q9,  q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64] /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8-d13}        /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2,  q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1,  q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5,  q9,  q15
    vadd.s16        q15, q9,  q15
    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
    vadd.s16        q3,  q1,  q1
    vsub.s16        q1,  q5,  q1
    vadd.s16        q10, q2,  q4
    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
    vsub.s16        q2,  q15, q13
    vadd.s16        q3,  q3,  q6
    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
    vadd.s16        q1,  q1,  q4
    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2,  q2,  q6
    vsub.s16        q6,  q8,  q12
    vadd.s16        q12, q8,  q12
    vadd.s16        q9,  q5,  q4
    vadd.s16        q5,  q6,  q10
    vsub.s16        q10, q6,  q10
    vadd.s16        q6,  q15, q13
    vadd.s16        q8,  q12, q14
    vsub.s16        q3,  q6,  q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3,  q3,  q1
    vsub.s16        q1,  q9,  q1
    vadd.s16        q2,  q3,  q2
    vsub.s16        q15, q8,  q6
    vadd.s16        q1,  q1,  q2
    vadd.s16        q8,  q8,  q6
    vadd.s16        q14, q5,  q3
    vsub.s16        q9,  q5,  q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
      /* Transpose */
      vtrn.16         q8,  q9
    vsub.s16        q11, q12, q1
      vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
      vtrn.16         q10, q11
      vtrn.16         q12, q13
      vtrn.32         q9,  q11
      vtrn.32         q12, q14
      vtrn.32         q8,  q10
      vtrn.32         q13, q15
      vswp            d28, d21
      vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2,  q10, q14
      vswp            d30, d23
    vadd.s16        q14, q10, q14
      vswp            d24, d17
    vsub.s16        q1,  q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5,  q9,  q15
    vadd.s16        q15, q9,  q15
    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
    vadd.s16        q3,  q1,  q1
    vsub.s16        q1,  q5,  q1
    vadd.s16        q10, q2,  q4
    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
    vsub.s16        q2,  q15, q13
    vadd.s16        q3,  q3,  q6
    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
    vadd.s16        q1,  q1,  q4
    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2,  q2,  q6
    vsub.s16        q6,  q8,  q12
    vadd.s16        q12, q8,  q12
    vadd.s16        q9,  q5,  q4
    vadd.s16        q5,  q6,  q10
    vsub.s16        q10, q6,  q10
    vadd.s16        q6,  q15, q13
    vadd.s16        q8,  q12, q14
    vsub.s16        q3,  q6,  q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3,  q3,  q1
    vsub.s16        q1,  q9,  q1
    vadd.s16        q2,  q3,  q2
    vsub.s16        q15, q8,  q6
    vadd.s16        q1,  q1,  q2
    vadd.s16        q8,  q8,  q6
    vadd.s16        q14, q5,  q3
    vsub.s16        q9,  q5,  q3
    vsub.s16        q13, q10, q2
    vpop            {d8-d13}        /* restore NEON registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8         q0,  #0x80
    vqshrn.s16      d16, q8,  #5
    vqshrn.s16      d17, q9,  #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8,  q8,  q0
    vadd.u8         q9,  q9,  q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8,  q9
    vtrn.16         q10, q11
    vtrn.32         q8,  q10
    vtrn.32         q9,  q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
      /* Store results to the output buffer */
      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      vst1.8          {d16}, [TMP1]
      vst1.8          {d17}, [TMP2]
      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
      vst1.8          {d19}, [TMP2]
      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      add             TMP3, TMP3, OUTPUT_COL
      add             TMP4, TMP4, OUTPUT_COL
      vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
      vst1.8          {d21}, [TMP2]
      vst1.8          {d22}, [TMP3]
      vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for producing reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */

#define CONST_BITS  13

#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
#define FIX_3_624509785  (29692) /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short     FIX_1_847759065     /* d0[0] */
    .short     -FIX_0_765366865    /* d0[1] */
    .short     -FIX_0_211164243    /* d0[2] */
    .short     FIX_1_451774981     /* d0[3] */
    .short     -FIX_2_172734803    /* d1[0] */
    .short     FIX_1_061594337     /* d1[1] */
    .short     -FIX_0_509795579    /* d1[2] */
    .short     -FIX_0_601344887    /* d1[3] */
    .short     FIX_0_899976223     /* d2[0] */
    .short     FIX_2_562915447     /* d2[1] */
    .short     1 << (CONST_BITS+1) /* d2[2] */
    .short     0                   /* d2[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4,  d2[2]
    vmlal.s16       q14, \x8,  d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6,  d2[1]

    vmull.s16       q15, \x4,  d2[2]
    vmlsl.s16       q15, \x8,  d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6,  d1[1]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q14,  q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
.endif

    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift > 16
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q15,  q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
.else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
.endif

.endm
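
/* Note on the shift parameters passed to idct_helper below: pass 1 uses
 * shift == 12 and pass 2 uses shift == 19, matching the DESCALE() amounts
 * of jpeg_idct_4x4() in jidctred.c (CONST_BITS - PASS1_BITS + 1 and
 * CONST_BITS + PASS1_BITS + 3 + 1 respectively).  A 32->16 bit narrowing
 * 'vrshrn' only accepts shifts of 1..16, which is why the macro falls back
 * to the two-instruction 'vrshr + vmovn' form when shift > 16.
 */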

asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses.
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for producing reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */

.balign 8
jsimd_idct_2x2_neon_consts:
    .short     -FIX_0_720959822    /* d0[0] */
    .short     FIX_0_850430095     /* d0[1] */
    .short     -FIX_1_272758580    /* d0[2] */
    .short     FIX_3_624509785     /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16  q14,  \x4,  #15
    vmull.s16  q13,  \x6,  d0[3]
    vmlal.s16  q13,  \x10, d0[2]
    vmlal.s16  q13,  \x12, d0[1]
    vmlal.s16  q13,  \x16, d0[0]

    vadd.s32   q10,  q14,  q13
    vsub.s32   q14,  q14,  q13

.if \shift > 16
    vrshr.s32  q10,  q10,  #\shift
    vrshr.s32  q14,  q14,  #\shift
    vmovn.s32  \y26, q10
    vmovn.s32  \y27, q14
.else
    vrshrn.s32 \y26, q10,  #\shift
    vrshrn.s32 \y27, q14,  #\shift
.endif

.endm
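
/* As in the 4x4 case, the shift parameters used below follow jpeg_idct_2x2()
 * in jidctred.c: the DC term is pre-scaled by 2^15 ('vshll #15'), pass 1
 * descales by 13 and pass 2 by 20, so the shift > 16 case again needs the
 * two-instruction 'vrshr + vmovn' form instead of a single 'vrshrn'.
 */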

asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | -       | -
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | -       | -
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8,  d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9,  d11
#else
    vmull.s16       q13, d6,  d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7,  d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4,  #15
    vshll.s16       q15, d5,  #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4,  q10, #13
    vrshrn.s32      d6,  q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5,  q10, #13
    vrshrn.s32      d7,  q14, #13
    vtrn.16         q2,  q3
    vtrn.32         q3,  q5
#endif

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
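
/* For reference, the conversion implemented below is the usual JPEG (JFIF)
 * YCbCr->RGB transform, with Cb/Cr biased by 128:
 *
 *     R = Y                        + 1.40200 * (Cr - 128)
 *     G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *     B = Y + 1.77200 * (Cb - 128)
 *
 * The fixed-point constants used by the code (22971, -11277, -23401, 29033)
 * are these coefficients scaled by 2^14 for the R/B terms and by 2^15 for
 * the G terms, which is why the G products are narrowed with '#15' and the
 * R/B products with '#14' in the stage 2 macros below.
 */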
1277
1278
1279.macro do_load size
1280    .if \size == 8
1281        vld1.8  {d4}, [U, :64]!
1282        vld1.8  {d5}, [V, :64]!
1283        vld1.8  {d0}, [Y, :64]!
1284        pld     [U, #64]
1285        pld     [V, #64]
1286        pld     [Y, #64]
1287    .elseif \size == 4
1288        vld1.8  {d4[0]}, [U]!
1289        vld1.8  {d4[1]}, [U]!
1290        vld1.8  {d4[2]}, [U]!
1291        vld1.8  {d4[3]}, [U]!
1292        vld1.8  {d5[0]}, [V]!
1293        vld1.8  {d5[1]}, [V]!
1294        vld1.8  {d5[2]}, [V]!
1295        vld1.8  {d5[3]}, [V]!
1296        vld1.8  {d0[0]}, [Y]!
1297        vld1.8  {d0[1]}, [Y]!
1298        vld1.8  {d0[2]}, [Y]!
1299        vld1.8  {d0[3]}, [Y]!
1300    .elseif \size == 2
1301        vld1.8  {d4[4]}, [U]!
1302        vld1.8  {d4[5]}, [U]!
1303        vld1.8  {d5[4]}, [V]!
1304        vld1.8  {d5[5]}, [V]!
1305        vld1.8  {d0[4]}, [Y]!
1306        vld1.8  {d0[5]}, [Y]!
1307    .elseif \size == 1
1308        vld1.8  {d4[6]}, [U]!
1309        vld1.8  {d5[6]}, [V]!
1310        vld1.8  {d0[6]}, [Y]!
1311    .else
1312        .error unsupported macroblock size
1313    .endif
1314.endm
1315
1316.macro do_store bpp, size
1317    .if \bpp == 24
1318        .if \size == 8
1319            vst3.8  {d10, d11, d12}, [RGB]!
1320        .elseif \size == 4
1321            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
1322            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
1323            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
1324            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
1325        .elseif \size == 2
1326            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
1327            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
1328        .elseif \size == 1
1329            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
1330        .else
1331            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error "unsupported macroblock size"
        .endif
    .else
        .error "unsupported bpp"
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */
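/*
 * Stage 1 widens Cb/Cr, applies the -128 level shift and starts the long
 * (32-bit) chroma multiplies; stage 2 narrows the products with rounding
 * shifts, adds Y and saturates the results into the output registers
 * selected by the r_offs/g_offs/b_offs macro arguments.  The fused
 * 'do_yuv_to_rgb_stage2_store_load_stage1' variant overlaps the store of one
 * 8-pixel group with the load and stage 1 of the next one in order to hide
 * the multiply latency.
 */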

.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1\g_offs, q10
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1
    vld1.8          {d4}, [U, :64]!
      vrshrn.s32      d20, q10, #15
      vrshrn.s32      d21, q11, #15
      vrshrn.s32      d24, q12, #14
      vrshrn.s32      d25, q13, #14
      vrshrn.s32      d28, q14, #14
    vld1.8          {d5}, [V, :64]!
      vrshrn.s32      d29, q15, #14
      vaddw.u8        q10, q10, d0
      vaddw.u8        q12, q12, d0
      vaddw.u8        q14, q14, d0
      vqmovun.s16     d1\g_offs, q10
    vld1.8          {d0}, [Y, :64]!
      vqmovun.s16     d1\r_offs, q12
    pld             [U, #64]
    pld             [V, #64]
    pld             [Y, #64]
      vqmovun.s16     d1\b_offs, q14
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
      do_store        \bpp, 8
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl; work around that by using adr instead.
 * This, however, requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short          0,      0,     0,      0
    .short          22971, -11277, -23401, 29033
    .short          -128,  -128,   -128,   -128
    .short          -128,  -128,   -128,   -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    blt             2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
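/*
 * For reference, a scalar sketch (in C, not part of the build) of the
 * per-pixel arithmetic implemented by the code below.  The multipliers are
 * the Q16 fixed-point constants from the per-function constant tables; Y is
 * rounded (VRSHRN), while Cb and Cr are truncated (VSHRN) after adding a
 * +32767 bias together with the +128 level shift (folded in as 128 << 16).
 * All names in the sketch are illustrative only.
 *
 *   // r, g, b are unsigned 8-bit samples of one pixel
 *   static void rgb_to_ycc_pixel(int r, int g, int b, unsigned char *y,
 *                                unsigned char *cb, unsigned char *cr)
 *   {
 *       *y  = (unsigned char) ((19595 * r + 38470 * g + 7471 * b
 *                               + 32768) >> 16);
 *       *cb = (unsigned char) ((-11059 * r - 21709 * g + 32768 * b
 *                               + (128 << 16) + 32767) >> 16);
 *       *cr = (unsigned char) ((32768 * r - 27439 * g - 5329 * b
 *                               + (128 << 16) + 32767) >> 16);
 *   }
 */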

.macro do_store size
    .if \size == 8
        vst1.8  {d20}, [Y]!
        vst1.8  {d21}, [U]!
        vst1.8  {d22}, [V]!
    .elseif \size == 4
        vst1.8  {d20[0]}, [Y]!
        vst1.8  {d20[1]}, [Y]!
        vst1.8  {d20[2]}, [Y]!
        vst1.8  {d20[3]}, [Y]!
        vst1.8  {d21[0]}, [U]!
        vst1.8  {d21[1]}, [U]!
        vst1.8  {d21[2]}, [U]!
        vst1.8  {d21[3]}, [U]!
        vst1.8  {d22[0]}, [V]!
        vst1.8  {d22[1]}, [V]!
        vst1.8  {d22[2]}, [V]!
        vst1.8  {d22[3]}, [V]!
    .elseif \size == 2
        vst1.8  {d20[4]}, [Y]!
        vst1.8  {d20[5]}, [Y]!
        vst1.8  {d21[4]}, [U]!
        vst1.8  {d21[5]}, [U]!
        vst1.8  {d22[4]}, [V]!
        vst1.8  {d22[5]}, [V]!
    .elseif \size == 1
        vst1.8  {d20[6]}, [Y]!
        vst1.8  {d21[6]}, [U]!
        vst1.8  {d22[6]}, [V]!
    .else
        .error "unsupported macroblock size"
    .endif
.endm

.macro do_load bpp, size
    .if \bpp == 24
        .if \size == 8
            vld3.8  {d10, d11, d12}, [RGB]!
            pld     [RGB, #128]
        .elseif \size == 4
            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error "unsupported macroblock size"
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vld4.8  {d10, d11, d12, d13}, [RGB]!
            pld     [RGB, #128]
        .elseif \size == 4
            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error "unsupported macroblock size"
        .endif
    .else
        .error "unsupported bpp"
    .endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */
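/*
 * Stage 1 widens R/G/B to 16 bits and computes the six 32-bit dot products
 * (Y in q7/q8, Cb in q9/q13, Cr in q14/q15, with the rounding/level-shift
 * bias preloaded from q1); stage 2 narrows the accumulators and packs the
 * results into d20 (Y), d21 (Cb) and d22 (Cr).  The fused
 * 'do_rgb_to_yuv_stage2_store_load_stage1' variant overlaps the stores of
 * one 8-pixel group with the load and stage 1 of the next one.
 */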

.macro do_rgb_to_yuv_stage1
    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
    vmull.u16   q7, d4, d0[0]
    vmlal.u16   q7, d6, d0[1]
    vmlal.u16   q7, d8, d0[2]
    vmull.u16   q8, d5, d0[0]
    vmlal.u16   q8, d7, d0[1]
    vmlal.u16   q8, d9, d0[2]
    vrev64.32   q9,  q1
    vrev64.32   q13, q1
    vmlsl.u16   q9,  d4, d0[3]
    vmlsl.u16   q9,  d6, d1[0]
    vmlal.u16   q9,  d8, d1[1]
    vmlsl.u16   q13, d5, d0[3]
    vmlsl.u16   q13, d7, d1[0]
    vmlal.u16   q13, d9, d1[1]
    vrev64.32   q14, q1
    vrev64.32   q15, q1
    vmlal.u16   q14, d4, d1[1]
    vmlsl.u16   q14, d6, d1[2]
    vmlsl.u16   q14, d8, d1[3]
    vmlal.u16   q15, d5, d1[1]
    vmlsl.u16   q15, d7, d1[2]
    vmlsl.u16   q15, d9, d1[3]
.endm

.macro do_rgb_to_yuv_stage2
    vrshrn.u32  d20, q7,  #16
    vrshrn.u32  d21, q8,  #16
    vshrn.u32   d22, q9,  #16
    vshrn.u32   d23, q13, #16
    vshrn.u32   d24, q14, #16
    vshrn.u32   d25, q15, #16
    vmovn.u16   d20, q10      /* d20 = y */
    vmovn.u16   d21, q11      /* d21 = u */
    vmovn.u16   d22, q12      /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

.macro do_rgb_to_yuv_stage2_store_load_stage1
      vrshrn.u32  d20, q7,  #16
      vrshrn.u32  d21, q8,  #16
      vshrn.u32   d22, q9,  #16
    vrev64.32   q9,  q1
      vshrn.u32   d23, q13, #16
    vrev64.32   q13, q1
      vshrn.u32   d24, q14, #16
      vshrn.u32   d25, q15, #16
    do_load     \bpp, 8
      vmovn.u16   d20, q10      /* d20 = y */
    vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
      vmovn.u16   d21, q11      /* d21 = u */
    vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
      vmovn.u16   d22, q12      /* d22 = v */
    vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
    vmull.u16   q7, d4, d0[0]
    vmlal.u16   q7, d6, d0[1]
    vmlal.u16   q7, d8, d0[2]
      vst1.8      {d20}, [Y]!
    vmull.u16   q8, d5, d0[0]
    vmlal.u16   q8, d7, d0[1]
    vmlal.u16   q8, d9, d0[2]
    vmlsl.u16   q9,  d4, d0[3]
    vmlsl.u16   q9,  d6, d1[0]
    vmlal.u16   q9,  d8, d1[1]
      vst1.8      {d21}, [U]!
    vmlsl.u16   q13, d5, d0[3]
    vmlsl.u16   q13, d7, d1[0]
    vmlal.u16   q13, d9, d1[1]
    vrev64.32   q14, q1
    vrev64.32   q15, q1
    vmlal.u16   q14, d4, d1[1]
    vmlsl.u16   q14, d6, d1[2]
    vmlsl.u16   q14, d8, d1[3]
      vst1.8      {d22}, [V]!
    vmlal.u16   q15, d5, d1[1]
    vmlsl.u16   q15, d7, d1[2]
    vmlsl.u16   q15, d9, d1[3]
.endm

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    .short          19595, 38470, 7471,  11059
    .short          21709, 32768, 27439, 5329
    .short          32767, 128,   32767, 128
    .short          32767, 128,   32767, 128

asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq          OUTPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of VST1.16 instructions
 */
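/*
 * A scalar sketch (in C, not part of the build) of what this function does:
 * copy one 8x8 block of samples into the workspace, converting them from
 * unsigned 8-bit values into signed 16-bit values centered around zero.
 *
 *   // sample_data[] holds 8 row pointers; workspace has room for 64 values
 *   for (row = 0; row < 8; row++)
 *       for (col = 0; col < 8; col++)
 *           *workspace++ = (short) sample_data[row][start_col + col] - 128;
 */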

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req r0
    START_COL       .req r1
    WORKSPACE       .req r2
    TMP1            .req r3
    TMP2            .req r4
    TMP3            .req r5
    TMP4            .req ip

    push            {r4, r5}
    vmov.u8         d0, #128

    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop             {r4, r5}
    bx              lr

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, but not so accurate, integer implementation
 * of the forward DCT (Discrete Cosine Transform). It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_fdct_ifast' function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */

#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)              /* XFIX_0_382683433 */
    .short (139 * 128)             /* XFIX_0_541196100 */
    .short (181 * 128)             /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
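/* The constants above are the CONST_BITS=8 fixed-point values from
 * jfdctfst.c (98, 139, 181, 334) scaled by 128, so that VQDMULH.S16, which
 * computes (a * b * 2) >> 16, reproduces the (a * const) >> 8 multiply of
 * the C code, e.g. (a * (181 * 128) * 2) >> 16 == (a * 181) >> 8.  Since
 * 334 * 128 does not fit into a signed 16-bit value, only the fractional
 * part (334 - 256) * 128 is stored for XFIX_1_306562965 and the remaining
 * 1.0 * x term is added back with a separate VADD.
 */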

asm_function jsimd_fdct_ifast_neon

    DATA            .req r0
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    | q8
     *   1 | d18     | d19    | q9
     *   2 | d20     | d21    | q10
     *   3 | d22     | d23    | q11
     *   4 | d24     | d25    | q12
     *   5 | d26     | d27    | q13
     *   6 | d28     | d29    | q14
     *   7 | d30     | d31    | q15
     */

    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)

    mov             TMP, #2
1:
    /* Transpose */
    vtrn.16         q12, q13
    vtrn.16         q10, q11
    vtrn.16         q8,  q9
    vtrn.16         q14, q15
    vtrn.32         q9,  q11
    vtrn.32         q13, q15
    vtrn.32         q8,  q10
    vtrn.32         q12, q14
    vswp            d30, d23
    vswp            d24, d17
    vswp            d26, d19
      /* 1-D FDCT */
      vadd.s16        q2,  q11, q12
    vswp            d28, d21
      vsub.s16        q12, q11, q12
      vsub.s16        q6,  q10, q13
      vadd.s16        q10, q10, q13
      vsub.s16        q7,  q9,  q14
      vadd.s16        q9,  q9,  q14
      vsub.s16        q1,  q8,  q15
      vadd.s16        q8,  q8,  q15
      vsub.s16        q4,  q9,  q10
      vsub.s16        q5,  q8,  q2
      vadd.s16        q3,  q9,  q10
      vadd.s16        q4,  q4,  q5
      vadd.s16        q2,  q8,  q2
      vqdmulh.s16     q4,  q4,  XFIX_0_707106781
      vadd.s16        q11, q12, q6
      vadd.s16        q8,  q2,  q3
      vsub.s16        q12, q2,  q3
      vadd.s16        q3,  q6,  q7
      vadd.s16        q7,  q7,  q1
      vqdmulh.s16     q3,  q3,  XFIX_0_707106781
      vsub.s16        q6,  q11, q7
      vadd.s16        q10, q5,  q4
      vqdmulh.s16     q6,  q6,  XFIX_0_382683433
      vsub.s16        q14, q5,  q4
      vqdmulh.s16     q11, q11, XFIX_0_541196100
      vqdmulh.s16     q5,  q7,  XFIX_1_306562965
      vadd.s16        q4,  q1,  q3
      vsub.s16        q3,  q1,  q3
      vadd.s16        q7,  q7,  q6
      vadd.s16        q11, q11, q6
      vadd.s16        q7,  q7,  q5
      vadd.s16        q13, q3,  q11
      vsub.s16        q11, q3,  q11
      vadd.s16        q9,  q4,  q7
      vsub.s16        q15, q4,  q7
    subs            TMP, TMP, #1
    bne             1b

    /* store results */
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]

    vpop            {d8-d15}
    bx              lr

    .unreq          DATA
    .unreq          TMP


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
 *                      DCTELEM * workspace);
 *
 * Note: the code uses 2-stage pipelining in order to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
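/*
 * A scalar sketch (in C, not part of the build) of the per-coefficient
 * operation implemented below.  'recip', 'corr' and 'shift' stand for the
 * three 64-element sub-tables of the divisors array that this code reads
 * from byte offsets 0, 64 * 2 and 64 * 6; the sign is stripped up front and
 * restored at the end with the XOR/SUB trick.
 *
 *   for (i = 0; i < 64; i++) {
 *       int x    = workspace[i];
 *       int sign = x >> 15;                    // 0 or -1 (arithmetic shift)
 *       int t    = x < 0 ? -x : x;
 *       t = (int) (((unsigned) (t + corr[i]) * recip[i]) >> 16);
 *       t >>= shift[i];
 *       coef_block[i] = (short) ((t ^ sign) - sign);
 *   }
 */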
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16        q12, q0
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10 /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8,  d26, d18
    vmull.u16       q9,  d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8,  #16
    vshrn.u32       d23, q9,  #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2,  q0,  #15 /* extract sign */
    vshr.s16        q3,  q1,  #15
    vshl.u16        q14, q10, q12 /* shift */
    vshl.u16        q15, q11, q13

    push            {r4, r5}
    mov             LOOP_COUNT, #3
1:
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2  /* restore sign */
    vabs.s16        q12, q0
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10 /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8,  d26, d18
    vmull.u16       q9,  d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8,  #16
    vshrn.u32       d23, q9,  #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2,  q0,  #15 /* extract sign */
    vshr.s16        q3,  q1,  #15
    vshl.u16        q14, q10, q12 /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1
    bne             1b
    pop             {r4, r5}

      veor.u16        q14, q14, q2  /* restore sign */
      veor.u16        q15, q15, q3
      vsub.u16        q14, q14, q2
      vsub.u16        q15, q15, q3
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx              lr /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
 *                                 JDIMENSION   downsampled_width,
 *                                 JSAMPARRAY   input_data,
 *                                 JSAMPARRAY * output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code; addressing it could potentially yield a performance
 *       improvement of up to tens of percent on Cortex-A8/Cortex-A9.
 */
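/*
 * A scalar sketch (in C, not part of the build) of the row arithmetic
 * implemented by the macros below: every interior output pixel is a 3/4:1/4
 * blend of its two nearest input pixels, with +1/+2 biases so that the two
 * halves of each pair round differently, and the first/last output pixels
 * are plain copies (handled separately in 'upsample_row').
 *
 *   out[0] = in[0];
 *   for (i = 1; i < width; i++) {
 *       out[2 * i - 1] = (unsigned char) ((3 * in[i - 1] + in[i] + 2) >> 2);
 *       out[2 * i]     = (unsigned char) ((3 * in[i] + in[i - 1] + 1) >> 2);
 *   }
 *   out[2 * width - 1] = in[width - 1];
 */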

/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3. Register q15 is used
 * for adding +1 bias.
 */
.macro upsample16   OUTPTR, INPTR
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8,  d0
    vext.8          q2,  q1,  q0, #15
    vmovl.u8        q9,  d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8,  d4,  d28
    vmlal.u8        q9,  d5,  d28
    vmlal.u8        q10, d0,  d28
    vmlal.u8        q11, d1,  d28
    vmov            q1,  q0       /* backup source pixels to q1 */
    vrshrn.u16      d6,  q8,  #2
    vrshrn.u16      d7,  q9,  #2
    vshrn.u16       d8,  q10, #2
    vshrn.u16       d9,  q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
 * the even and odd groups of 16 pixels, which is why the "vmov q1, q0"
 * instruction is not needed. This unrolling also allows loads and stores to
 * be reordered to compensate for the multiplication latency and reduce
 * stalls.
 */
.macro upsample32   OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8,  d0
    vext.8          q2,  q1,  q0, #15
    vmovl.u8        q9,  d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8,  d4,  d28
    vmlal.u8        q9,  d5,  d28
    vmlal.u8        q10, d0,  d28
    vmlal.u8        q11, d1,  d28
        /* odd 16 pixels group */
        vld1.8          {q1}, [\INPTR]!
    vrshrn.u16      d6,  q8,  #2
    vrshrn.u16      d7,  q9,  #2
    vshrn.u16       d8,  q10, #2
    vshrn.u16       d9,  q11, #2
        vmovl.u8        q8,  d2
        vext.8          q2,  q0,  q1, #15
        vmovl.u8        q9,  d3
        vaddw.u8        q10, q15, d4
        vaddw.u8        q11, q15, d5
        vmlal.u8        q8,  d4,  d28
        vmlal.u8        q9,  d5,  d28
        vmlal.u8        q10, d2,  d28
        vmlal.u8        q11, d3,  d28
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
        vrshrn.u16      d6,  q8,  #2
        vrshrn.u16      d7,  q9,  #2
        vshrn.u16       d8,  q10, #2
        vshrn.u16       d9,  q11, #2
        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
    /* special case for the first and last pixels */
    sub             \WIDTH, \WIDTH, #1
    add             \OUTPTR, \OUTPTR, #1
    ldrb            \TMP1, [\INPTR, \WIDTH]
    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
    ldrb            \TMP1, [\INPTR], #1
    strb            \TMP1, [\OUTPTR, #-1]
    vmov.8          d3[7], \TMP1

    subs            \WIDTH, \WIDTH, #32
    blt             5f
0:  /* process 32 pixels per iteration */
    upsample32      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #32
    bge             0b
5:
    adds            \WIDTH, \WIDTH, #16
    blt             1f
0:  /* process 16 pixels if needed */
    upsample16      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #16
1:
    adds            \WIDTH, \WIDTH, #16
    beq             9f

    /* load the remaining 1-15 pixels */
    add             \INPTR, \INPTR, \WIDTH
    tst             \WIDTH, #1
    beq             2f
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #2
    beq             2f
    vext.8          d0, d0, d0, #6
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #4
    beq             2f
    vrev64.32       d0, d0
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[3]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[2]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #8
    beq             2f
    vmov            d1,  d0
    sub             \INPTR, \INPTR, #8
    vld1.8          {d0}, [\INPTR]
2:  /* upsample the remaining pixels */
    vmovl.u8        q8,  d0
    vext.8          q2,  q1,  q0, #15
    vmovl.u8        q9,  d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8,  d4,  d28
    vmlal.u8        q9,  d5,  d28
    vmlal.u8        q10, d0,  d28
    vmlal.u8        q11, d1,  d28
    vrshrn.u16      d10, q8,  #2
    vrshrn.u16      d12, q9,  #2
    vshrn.u16       d11, q10, #2
    vshrn.u16       d13, q11, #2
    vzip.8          d10, d11
    vzip.8          d12, d13
    /* store the remaining pixels */
    tst             \WIDTH, #8
    beq             2f
    vst1.8          {d10, d11}, [\OUTPTR]!
    vmov            q5,  q6
2:
    tst             \WIDTH, #4
    beq             2f
    vst1.8          {d10}, [\OUTPTR]!
    vmov            d10,  d11
2:
    tst             \WIDTH, #2
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
    vst1.8          {d10[2]}, [\OUTPTR]!
    vst1.8          {d10[3]}, [\OUTPTR]!
    vext.8          d10, d10, d10, #4
2:
    tst             \WIDTH, #1
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
2:
9:
.endm

asm_function jsimd_h2v1_fancy_upsample_neon

    MAX_V_SAMP_FACTOR .req r0
    DOWNSAMPLED_WIDTH .req r1
    INPUT_DATA        .req r2
    OUTPUT_DATA_PTR   .req r3
    OUTPUT_DATA       .req OUTPUT_DATA_PTR

    OUTPTR            .req r4
    INPTR             .req r5
    WIDTH             .req ip
    TMP               .req lr

    push            {r4, r5, r6, lr}
    vpush           {d8-d15}

    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp             MAX_V_SAMP_FACTOR, #0
    ble             99f

    /* initialize constants */
    vmov.u8         d28, #3
    vmov.u16        q15, #1
11:
    ldr             INPTR, [INPUT_DATA], #4
    ldr             OUTPTR, [OUTPUT_DATA], #4
    mov             WIDTH, DOWNSAMPLED_WIDTH
    upsample_row    OUTPTR, INPTR, WIDTH, TMP
    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
    bgt             11b

99:
    vpop            {d8-d15}
    pop             {r4, r5, r6, pc}

    .unreq          MAX_V_SAMP_FACTOR
    .unreq          DOWNSAMPLED_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA_PTR
    .unreq          OUTPUT_DATA

    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          WIDTH
    .unreq          TMP


.purgem upsample16
.purgem upsample32
.purgem upsample_row
