1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
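18// This module is for GCC Neon.
// NOTE(review): the guard below selects __aarch64__, but the inline assembly in
// this file is written with 32-bit ARM NEON mnemonics (vld1.8, d/q registers).
// Confirm this is intentional before enabling any HAS_*_NEON macro on AArch64.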
19#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
21// Read 8 Y, 4 U and 4 V from 422
// Loads 8 bytes from [%0] into d0, 4 bytes from [%1] into the low 32 bits of
// d2 and 4 bytes from [%2] into the high 32 bits of d2. All three pointers are
// post-incremented. Result layout: d0 = Y0..Y7, d2 = U0..U3 V0..V3.
22#define READYUV422                                                             \
23    MEMACCESS(0)                                                               \
24    "vld1.8     {d0}, [%0]!                    \n"                             \
25    MEMACCESS(1)                                                               \
26    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
27    MEMACCESS(2)                                                               \
28    "vld1.32    {d2[1]}, [%2]!                 \n"
29
30// Read 8 Y, 2 U and 2 V from 411
// Loads 8 bytes from [%0] into d0, 2 bytes from [%1] into 16-bit lane 0 of d2
// and 2 bytes from [%2] into 16-bit lane 1 of d2 (pointers post-incremented).
// The vmov/vzip pair then duplicates every chroma byte, so the lanes the
// conversion macro reads end up as d2 = U0 U0 U1 U1 V0 V0 V1 V1.
31#define READYUV411                                                             \
32    MEMACCESS(0)                                                               \
33    "vld1.8     {d0}, [%0]!                    \n"                             \
34    MEMACCESS(1)                                                               \
35    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
36    MEMACCESS(2)                                                               \
37    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
38    "vmov.u8    d3, d2                         \n"                             \
39    "vzip.u8    d2, d3                         \n"
40
41// Read 8 Y, 8 U and 8 V from 444
// Loads 8 bytes each from [%0], [%1] and [%2] into d0, d2 and d3 (pointers
// post-incremented). vpaddl.u8 sums horizontally adjacent chroma bytes into
// 16-bit lanes and vrshrn #1 halves them with rounding, so the output is the
// same layout the other readers produce: d2 = 4 averaged U followed by
// 4 averaged V.
42#define READYUV444                                                             \
43    MEMACCESS(0)                                                               \
44    "vld1.8     {d0}, [%0]!                    \n"                             \
45    MEMACCESS(1)                                                               \
46    "vld1.8     {d2}, [%1]!                    \n"                             \
47    MEMACCESS(2)                                                               \
48    "vld1.8     {d3}, [%2]!                    \n"                             \
49    "vpaddl.u8  q1, q1                         \n"                             \
50    "vrshrn.u16 d2, q1, #1                     \n"
51
52// Read 8 Y, and set 4 U and 4 V to 128
// Loads 8 bytes from [%0] into d0 (pointer post-incremented) and fills every
// byte of d2 with 128. YUV422TORGB XORs d2 with 128 first, so the chroma terms
// become zero and the output is grey.
53#define READYUV400                                                             \
54    MEMACCESS(0)                                                               \
55    "vld1.8     {d0}, [%0]!                    \n"                             \
56    "vmov.u8    d2, #128                       \n"
57
58// Read 8 Y and 4 UV from NV12
// Loads 8 bytes from [%0] into d0 and 8 interleaved chroma bytes from [%1]
// into d2 (pointers post-incremented). The copy + vuzp separates even bytes
// (into d2) from odd bytes (into d3); vtrn.u32 then packs them so that
// d2 = 4 even-position bytes followed by 4 odd-position bytes, i.e. U then V
// for U-first interleaving.
59#define READNV12                                                               \
60    MEMACCESS(0)                                                               \
61    "vld1.8     {d0}, [%0]!                    \n"                             \
62    MEMACCESS(1)                                                               \
63    "vld1.8     {d2}, [%1]!                    \n"                             \
64    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
65    "vuzp.u8    d2, d3                         \n"                             \
66    "vtrn.u32   d2, d3                         \n"
67
68// Read 8 Y and 4 VU from NV21
// Same as READNV12 except the vuzp operands are swapped: even-position bytes
// go to d3 and odd-position bytes to d2, so after vtrn.u32 d2 holds the 4
// odd-position bytes followed by the 4 even-position bytes (U then V for
// V-first interleaving).
69#define READNV21                                                               \
70    MEMACCESS(0)                                                               \
71    "vld1.8     {d0}, [%0]!                    \n"                             \
72    MEMACCESS(1)                                                               \
73    "vld1.8     {d2}, [%1]!                    \n"                             \
74    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
75    "vuzp.u8    d3, d2                         \n"                             \
76    "vtrn.u32   d2, d3                         \n"
77
78// Read 8 YUY2
// vld2.8 reads 16 bytes from [%0] (post-incremented) and de-interleaves them:
// even bytes (Y) land in d0, odd bytes (alternating chroma) in d2. The chroma
// bytes are then split and repacked exactly as in READNV12, giving
// d2 = 4 even-position chroma bytes followed by 4 odd-position chroma bytes.
79#define READYUY2                                                               \
80    MEMACCESS(0)                                                               \
81    "vld2.8     {d0, d2}, [%0]!                \n"                             \
82    "vmov.u8    d3, d2                         \n"                             \
83    "vuzp.u8    d2, d3                         \n"                             \
84    "vtrn.u32   d2, d3                         \n"
85
86// Read 8 UYVY
// vld2.8 reads 16 bytes from [%0] (post-incremented) and de-interleaves them:
// even bytes (alternating chroma) land in d2, odd bytes (Y) in d3. Y is moved
// to d0, then the chroma bytes are split and repacked as in READNV12, giving
// d2 = 4 even-position chroma bytes followed by 4 odd-position chroma bytes.
87#define READUYVY                                                               \
88    MEMACCESS(0)                                                               \
89    "vld2.8     {d2, d3}, [%0]!                \n"                             \
90    "vmov.u8    d0, d3                         \n"                             \
91    "vmov.u8    d3, d2                         \n"                             \
92    "vuzp.u8    d2, d3                         \n"                             \
93    "vtrn.u32   d2, d3                         \n"
94
// Convert 8 pixels of YUV to 8-bit B, G and R planes.
// Inputs:  d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes,
//          d24/d25 = chroma coefficients, d26 = 128 in every byte,
//          q14 = Y scale, q15 = Y offset (both 16-bit lanes).
// Outputs: d20 = B, d21 = G, d22 = R, one byte per pixel in pixel order.
// Scratch: q0, d2, q8, q9 and the rest of q10/q11 are overwritten.
// Each chroma sample is applied to one even and one odd Y, so a U/V pair is
// shared by two horizontally adjacent pixels. Sums are saturating 16-bit and
// are narrowed with an unsigned-saturating right shift by 6.
95#define YUV422TORGB                                                            \
96    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
97    "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\
98    "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\
99    "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\
100    "vtrn.u8    d0, d1                         \n"                             \
101    "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\
102    "vmul.s16   q0, q0, q14                    \n"                             \
103    "vadd.s16   d18, d19                       \n"                             \
104    "vqadd.s16  d20, d0, d16                   \n" /* B */                     \
105    "vqadd.s16  d21, d1, d16                   \n"                             \
106    "vqadd.s16  d22, d0, d17                   \n" /* R */                     \
107    "vqadd.s16  d23, d1, d17                   \n"                             \
108    "vqadd.s16  d16, d0, d18                   \n" /* G */                     \
109    "vqadd.s16  d17, d1, d18                   \n"                             \
110    "vqshrun.s16 d0, q10, #6                   \n" /* B */                     \
111    "vqshrun.s16 d1, q11, #6                   \n" /* R */                     \
112    "vqshrun.s16 d2, q8, #6                    \n" /* G */                     \
113    "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\
114    "vmovl.u8   q11, d1                        \n"                             \
115    "vmovl.u8   q8, d2                         \n"                             \
116    "vtrn.u8    d20, d21                       \n"                             \
117    "vtrn.u8    d22, d23                       \n"                             \
118    "vtrn.u8    d16, d17                       \n"                             \
119    "vmov.u8    d21, d16                       \n"
120
// Signed 8-bit chroma coefficients for blue and red. The row functions load
// only the first 8 bytes (into d24): entries 0-3 multiply the U lanes (blue
// term) and entries 4-7 multiply the V lanes (red term) in YUV422TORGB. The
// products are later shifted right by 6, so the values are scaled by 64.
121static vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
122                         0, 0, 0, 0, 0, 0, 0, 0 };
// Signed 8-bit chroma coefficients for green. The row functions load only the
// first 8 bytes (into d25): entries 0-3 multiply the U lanes and entries 4-7
// multiply the V lanes; YUV422TORGB adds the two products to form the green
// term. Scaled by 64 like the blue/red coefficients (results are shifted
// right by 6).
123static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
124                       0, 0, 0, 0, 0, 0, 0, 0 };
125
126#ifdef HAS_I444TOARGBROW_NEON
// Converts one row of planar YUV with full-resolution chroma to 32-bit pixels
// stored as B, G, R, A bytes (A = 255).
//   src_y, src_u, src_v: source planes; 8 bytes are read from each per pass.
//   dst_argb:            destination; 32 bytes are written per pass.
//   width:               pixel count; decremented by 8 per pass.
// Adjacent chroma samples are averaged in pairs by READYUV444 before
// conversion. The loop body always runs at least once and repeats while the
// remaining count is > 0, so width is presumably expected to be a positive
// multiple of 8 (or the buffers padded) — confirm against callers.
127void I444ToARGBRow_NEON(const uint8* src_y,
128                        const uint8* src_u,
129                        const uint8* src_v,
130                        uint8* dst_argb,
131                        int width) {
132  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
133    MEMACCESS(5)
134    "vld1.8     {d24}, [%5]                    \n"
135    MEMACCESS(6)
136    "vld1.8     {d25}, [%6]                    \n"
137    "vmov.u8    d26, #128                      \n"
138    "vmov.u16   q14, #74                       \n"
139    "vmov.u16   q15, #16                       \n"
140    ".p2align   2                              \n"
141  "1:                                          \n"
142    READYUV444
143    YUV422TORGB
144    "subs       %4, %4, #8                     \n"
    // d23 supplies the constant alpha byte for the 4-way interleaved store.
145    "vmov.u8    d23, #255                      \n"
146    MEMACCESS(3)
147    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
148    "bgt        1b                             \n"
149    : "+r"(src_y),     // %0
150      "+r"(src_u),     // %1
151      "+r"(src_v),     // %2
152      "+r"(dst_argb),  // %3
153      "+r"(width)      // %4
154    : "r"(&kUVToRB),   // %5
155      "r"(&kUVToG)     // %6
156    : "cc", "memory", "q0", "q1", "q2", "q3",
157      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
158  );
159}
160#endif  // HAS_I444TOARGBROW_NEON
161
162#ifdef HAS_I422TOARGBROW_NEON
// Converts one row of planar YUV with half-width chroma to 32-bit pixels
// stored as B, G, R, A bytes (A = 255).
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_argb:     destination; 32 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
163void I422ToARGBRow_NEON(const uint8* src_y,
164                        const uint8* src_u,
165                        const uint8* src_v,
166                        uint8* dst_argb,
167                        int width) {
168  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
169    MEMACCESS(5)
170    "vld1.8     {d24}, [%5]                    \n"
171    MEMACCESS(6)
172    "vld1.8     {d25}, [%6]                    \n"
173    "vmov.u8    d26, #128                      \n"
174    "vmov.u16   q14, #74                       \n"
175    "vmov.u16   q15, #16                       \n"
176    ".p2align   2                              \n"
177  "1:                                          \n"
178    READYUV422
179    YUV422TORGB
180    "subs       %4, %4, #8                     \n"
    // d23 supplies the constant alpha byte for the 4-way interleaved store.
181    "vmov.u8    d23, #255                      \n"
182    MEMACCESS(3)
183    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
184    "bgt        1b                             \n"
185    : "+r"(src_y),     // %0
186      "+r"(src_u),     // %1
187      "+r"(src_v),     // %2
188      "+r"(dst_argb),  // %3
189      "+r"(width)      // %4
190    : "r"(&kUVToRB),   // %5
191      "r"(&kUVToG)     // %6
192    : "cc", "memory", "q0", "q1", "q2", "q3",
193      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
194  );
195}
196#endif  // HAS_I422TOARGBROW_NEON
197
198#ifdef HAS_I411TOARGBROW_NEON
// Converts one row of planar YUV with quarter-width chroma to 32-bit pixels
// stored as B, G, R, A bytes (A = 255).
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 2 bytes read from each per pass (READYUV411
//                 duplicates them so each sample covers 4 pixels).
//   dst_argb:     destination; 32 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
199void I411ToARGBRow_NEON(const uint8* src_y,
200                        const uint8* src_u,
201                        const uint8* src_v,
202                        uint8* dst_argb,
203                        int width) {
204  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
205    MEMACCESS(5)
206    "vld1.8     {d24}, [%5]                    \n"
207    MEMACCESS(6)
208    "vld1.8     {d25}, [%6]                    \n"
209    "vmov.u8    d26, #128                      \n"
210    "vmov.u16   q14, #74                       \n"
211    "vmov.u16   q15, #16                       \n"
212    ".p2align   2                              \n"
213  "1:                                          \n"
214    READYUV411
215    YUV422TORGB
216    "subs       %4, %4, #8                     \n"
    // d23 supplies the constant alpha byte for the 4-way interleaved store.
217    "vmov.u8    d23, #255                      \n"
218    MEMACCESS(3)
219    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
220    "bgt        1b                             \n"
221    : "+r"(src_y),     // %0
222      "+r"(src_u),     // %1
223      "+r"(src_v),     // %2
224      "+r"(dst_argb),  // %3
225      "+r"(width)      // %4
226    : "r"(&kUVToRB),   // %5
227      "r"(&kUVToG)     // %6
228    : "cc", "memory", "q0", "q1", "q2", "q3",
229      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
230  );
231}
232#endif  // HAS_I411TOARGBROW_NEON
233
234#ifdef HAS_I422TOBGRAROW_NEON
// Converts one row of planar YUV with half-width chroma to 32-bit pixels
// stored as A, R, G, B bytes (A = 255).
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_bgra:     destination; 32 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
235void I422ToBGRARow_NEON(const uint8* src_y,
236                        const uint8* src_u,
237                        const uint8* src_v,
238                        uint8* dst_bgra,
239                        int width) {
240  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
241    MEMACCESS(5)
242    "vld1.8     {d24}, [%5]                    \n"
243    MEMACCESS(6)
244    "vld1.8     {d25}, [%6]                    \n"
245    "vmov.u8    d26, #128                      \n"
246    "vmov.u16   q14, #74                       \n"
247    "vmov.u16   q15, #16                       \n"
248    ".p2align   2                              \n"
249  "1:                                          \n"
250    READYUV422
251    YUV422TORGB
252    "subs       %4, %4, #8                     \n"
    // Swap B and R so d20..d22 = R, G, B; alpha goes in d19 so it is stored
    // first.
253    "vswp.u8    d20, d22                       \n"
254    "vmov.u8    d19, #255                      \n"
255    MEMACCESS(3)
256    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
257    "bgt        1b                             \n"
258    : "+r"(src_y),     // %0
259      "+r"(src_u),     // %1
260      "+r"(src_v),     // %2
261      "+r"(dst_bgra),  // %3
262      "+r"(width)      // %4
263    : "r"(&kUVToRB),   // %5
264      "r"(&kUVToG)     // %6
265    : "cc", "memory", "q0", "q1", "q2", "q3",
266      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
267  );
268}
269#endif  // HAS_I422TOBGRAROW_NEON
270
271#ifdef HAS_I422TOABGRROW_NEON
// Converts one row of planar YUV with half-width chroma to 32-bit pixels
// stored as R, G, B, A bytes (A = 255).
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_abgr:     destination; 32 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
272void I422ToABGRRow_NEON(const uint8* src_y,
273                        const uint8* src_u,
274                        const uint8* src_v,
275                        uint8* dst_abgr,
276                        int width) {
277  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
278    MEMACCESS(5)
279    "vld1.8     {d24}, [%5]                    \n"
280    MEMACCESS(6)
281    "vld1.8     {d25}, [%6]                    \n"
282    "vmov.u8    d26, #128                      \n"
283    "vmov.u16   q14, #74                       \n"
284    "vmov.u16   q15, #16                       \n"
285    ".p2align   2                              \n"
286  "1:                                          \n"
287    READYUV422
288    YUV422TORGB
289    "subs       %4, %4, #8                     \n"
    // Swap B and R so d20..d22 = R, G, B; alpha goes last in d23.
290    "vswp.u8    d20, d22                       \n"
291    "vmov.u8    d23, #255                      \n"
292    MEMACCESS(3)
293    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
294    "bgt        1b                             \n"
295    : "+r"(src_y),     // %0
296      "+r"(src_u),     // %1
297      "+r"(src_v),     // %2
298      "+r"(dst_abgr),  // %3
299      "+r"(width)      // %4
300    : "r"(&kUVToRB),   // %5
301      "r"(&kUVToG)     // %6
302    : "cc", "memory", "q0", "q1", "q2", "q3",
303      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
304  );
305}
306#endif  // HAS_I422TOABGRROW_NEON
307
308#ifdef HAS_I422TORGBAROW_NEON
// Converts one row of planar YUV with half-width chroma to 32-bit pixels
// stored as A, B, G, R bytes (A = 255).
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_rgba:     destination; 32 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
309void I422ToRGBARow_NEON(const uint8* src_y,
310                        const uint8* src_u,
311                        const uint8* src_v,
312                        uint8* dst_rgba,
313                        int width) {
314  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
315    MEMACCESS(5)
316    "vld1.8     {d24}, [%5]                    \n"
317    MEMACCESS(6)
318    "vld1.8     {d25}, [%6]                    \n"
319    "vmov.u8    d26, #128                      \n"
320    "vmov.u16   q14, #74                       \n"
321    "vmov.u16   q15, #16                       \n"
322    ".p2align   2                              \n"
323  "1:                                          \n"
324    READYUV422
325    YUV422TORGB
326    "subs       %4, %4, #8                     \n"
    // Alpha goes in d19 so it is stored ahead of B, G, R.
327    "vmov.u8    d19, #255                      \n"
328    MEMACCESS(3)
329    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
330    "bgt        1b                             \n"
331    : "+r"(src_y),     // %0
332      "+r"(src_u),     // %1
333      "+r"(src_v),     // %2
334      "+r"(dst_rgba),  // %3
335      "+r"(width)      // %4
336    : "r"(&kUVToRB),   // %5
337      "r"(&kUVToG)     // %6
338    : "cc", "memory", "q0", "q1", "q2", "q3",
339      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
340  );
341}
342#endif  // HAS_I422TORGBAROW_NEON
343
344#ifdef HAS_I422TORGB24ROW_NEON
// Converts one row of planar YUV with half-width chroma to 24-bit pixels
// stored as B, G, R bytes.
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_rgb24:    destination; 24 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
345void I422ToRGB24Row_NEON(const uint8* src_y,
346                         const uint8* src_u,
347                         const uint8* src_v,
348                         uint8* dst_rgb24,
349                         int width) {
350  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
351    MEMACCESS(5)
352    "vld1.8     {d24}, [%5]                    \n"
353    MEMACCESS(6)
354    "vld1.8     {d25}, [%6]                    \n"
355    "vmov.u8    d26, #128                      \n"
356    "vmov.u16   q14, #74                       \n"
357    "vmov.u16   q15, #16                       \n"
358    ".p2align   2                              \n"
359  "1:                                          \n"
360    READYUV422
361    YUV422TORGB
362    "subs       %4, %4, #8                     \n"
363    MEMACCESS(3)
364    "vst3.8     {d20, d21, d22}, [%3]!         \n"
365    "bgt        1b                             \n"
366    : "+r"(src_y),      // %0
367      "+r"(src_u),      // %1
368      "+r"(src_v),      // %2
369      "+r"(dst_rgb24),  // %3
370      "+r"(width)       // %4
371    : "r"(&kUVToRB),    // %5
372      "r"(&kUVToG)      // %6
373    : "cc", "memory", "q0", "q1", "q2", "q3",
374      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
375  );
376}
377#endif  // HAS_I422TORGB24ROW_NEON
378
379#ifdef HAS_I422TORAWROW_NEON
// Converts one row of planar YUV with half-width chroma to 24-bit pixels
// stored as R, G, B bytes.
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_raw:      destination; 24 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
380void I422ToRAWRow_NEON(const uint8* src_y,
381                       const uint8* src_u,
382                       const uint8* src_v,
383                       uint8* dst_raw,
384                       int width) {
385  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
386    MEMACCESS(5)
387    "vld1.8     {d24}, [%5]                    \n"
388    MEMACCESS(6)
389    "vld1.8     {d25}, [%6]                    \n"
390    "vmov.u8    d26, #128                      \n"
391    "vmov.u16   q14, #74                       \n"
392    "vmov.u16   q15, #16                       \n"
393    ".p2align   2                              \n"
394  "1:                                          \n"
395    READYUV422
396    YUV422TORGB
397    "subs       %4, %4, #8                     \n"
    // Swap B and R so the 3-way store emits R, G, B.
398    "vswp.u8    d20, d22                       \n"
399    MEMACCESS(3)
400    "vst3.8     {d20, d21, d22}, [%3]!         \n"
401    "bgt        1b                             \n"
402    : "+r"(src_y),    // %0
403      "+r"(src_u),    // %1
404      "+r"(src_v),    // %2
405      "+r"(dst_raw),  // %3
406      "+r"(width)     // %4
407    : "r"(&kUVToRB),  // %5
408      "r"(&kUVToG)    // %6
409    : "cc", "memory", "q0", "q1", "q2", "q3",
410      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
411  );
412}
413#endif  // HAS_I422TORAWROW_NEON
414
// Pack 8 pixels held as byte planes d20 = B, d21 = G, d22 = R into eight
// 16-bit values in q0, each laid out as (R >> 3) << 11 | (G >> 2) << 5 |
// (B >> 3). Overwrites q8, q9 and q10 (d20/d21 are consumed before q10 is
// rewritten).
415#define ARGBTORGB565                                                           \
416    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
417    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
418    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
419    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
420    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
421    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
422    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
423    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
424    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
425    "vorr       q0, q0, q10                    \n"  /* BGR                  */
426
427#ifdef HAS_I422TORGB565ROW_NEON
// Converts one row of planar YUV with half-width chroma to 16-bit pixels
// packed by ARGBTORGB565 (5 bits B, 6 bits G, 5 bits R from low to high).
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_rgb565:   destination; 16 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
428void I422ToRGB565Row_NEON(const uint8* src_y,
429                          const uint8* src_u,
430                          const uint8* src_v,
431                          uint8* dst_rgb565,
432                          int width) {
433  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
434    MEMACCESS(5)
435    "vld1.8     {d24}, [%5]                    \n"
436    MEMACCESS(6)
437    "vld1.8     {d25}, [%6]                    \n"
438    "vmov.u8    d26, #128                      \n"
439    "vmov.u16   q14, #74                       \n"
440    "vmov.u16   q15, #16                       \n"
441    ".p2align   2                              \n"
442  "1:                                          \n"
443    READYUV422
444    YUV422TORGB
445    "subs       %4, %4, #8                     \n"
446    ARGBTORGB565
447    MEMACCESS(3)
448    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
449    "bgt        1b                             \n"
450    : "+r"(src_y),    // %0
451      "+r"(src_u),    // %1
452      "+r"(src_v),    // %2
453      "+r"(dst_rgb565),  // %3
454      "+r"(width)     // %4
455    : "r"(&kUVToRB),  // %5
456      "r"(&kUVToG)    // %6
457    : "cc", "memory", "q0", "q1", "q2", "q3",
458      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
459  );
460}
461#endif  // HAS_I422TORGB565ROW_NEON
462
// Pack 8 pixels held as byte planes d20 = B, d21 = G, d22 = R, d23 = A into
// eight 16-bit values in q0, each laid out as (A >> 7) << 15 | (R >> 3) << 10
// | (G >> 3) << 5 | (B >> 3). The first shift operates on q10, i.e. on both
// d20 (B) and d21 (G) at once. Overwrites q1 and q8-q11.
463#define ARGBTOARGB1555                                                         \
464    "vshr.u8    q10, q10, #3                   \n"  /* B and G              */ \
465    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
466    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
467    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
468    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
469    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
470    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
471    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
472    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
473    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
474    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
475    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
476    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
477
478#ifdef HAS_I422TOARGB1555ROW_NEON
// Converts one row of planar YUV with half-width chroma to 16-bit pixels
// packed by ARGBTOARGB1555 (5 bits each of B, G, R from low to high, then a
// 1-bit alpha that is always set because alpha is forced to 255).
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_argb1555: destination; 16 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
479void I422ToARGB1555Row_NEON(const uint8* src_y,
480                            const uint8* src_u,
481                            const uint8* src_v,
482                            uint8* dst_argb1555,
483                            int width) {
484  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
485    MEMACCESS(5)
486    "vld1.8     {d24}, [%5]                    \n"
487    MEMACCESS(6)
488    "vld1.8     {d25}, [%6]                    \n"
489    "vmov.u8    d26, #128                      \n"
490    "vmov.u16   q14, #74                       \n"
491    "vmov.u16   q15, #16                       \n"
492    ".p2align   2                              \n"
493  "1:                                          \n"
494    READYUV422
495    YUV422TORGB
496    "subs       %4, %4, #8                     \n"
    // Opaque alpha plane consumed by the packing macro.
497    "vmov.u8    d23, #255                      \n"
498    ARGBTOARGB1555
499    MEMACCESS(3)
500    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
501    "bgt        1b                             \n"
502    : "+r"(src_y),    // %0
503      "+r"(src_u),    // %1
504      "+r"(src_v),    // %2
505      "+r"(dst_argb1555),  // %3
506      "+r"(width)     // %4
507    : "r"(&kUVToRB),  // %5
508      "r"(&kUVToG)    // %6
509    : "cc", "memory", "q0", "q1", "q2", "q3",
510      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
511  );
512}
513#endif  // HAS_I422TOARGB1555ROW_NEON
514
// Pack 8 pixels held as byte planes d20 = B, d21 = G, d22 = R, d23 = A into
// 16 bytes in q0, two bytes per pixel: first (G & 0xf0) | (B >> 4), then
// (A & 0xf0) | (R >> 4). Requires d4 to hold the bits to clear (the caller in
// this file sets it to 0x0f in every byte). The final vzip interleaves the
// B/G bytes with the R/A bytes in pixel order.
515#define ARGBTOARGB4444                                                         \
516    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
517    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
518    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
519    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
520    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
521    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
522    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
523
524#ifdef HAS_I422TOARGB4444ROW_NEON
// Converts one row of planar YUV with half-width chroma to 16-bit pixels
// packed by ARGBTOARGB4444 (4 bits per channel; alpha is forced to 255 before
// packing, so its nibble is always 0xf).
//   src_y:        luma plane; 8 bytes read per pass.
//   src_u, src_v: chroma planes; 4 bytes read from each per pass.
//   dst_argb4444: destination; 16 bytes written per pass.
//   width:        pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
525void I422ToARGB4444Row_NEON(const uint8* src_y,
526                            const uint8* src_u,
527                            const uint8* src_v,
528                            uint8* dst_argb4444,
529                            int width) {
530  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset,
    // plus the nibble mask in d4 (part of q2, which is in the clobber list).
531    MEMACCESS(5)
532    "vld1.8     {d24}, [%5]                    \n"
533    MEMACCESS(6)
534    "vld1.8     {d25}, [%6]                    \n"
535    "vmov.u8    d26, #128                      \n"
536    "vmov.u16   q14, #74                       \n"
537    "vmov.u16   q15, #16                       \n"
538    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
539    ".p2align   2                              \n"
540  "1:                                          \n"
541    READYUV422
542    YUV422TORGB
543    "subs       %4, %4, #8                     \n"
    // Opaque alpha plane consumed by the packing macro.
544    "vmov.u8    d23, #255                      \n"
545    ARGBTOARGB4444
546    MEMACCESS(3)
547    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
548    "bgt        1b                             \n"
549    : "+r"(src_y),    // %0
550      "+r"(src_u),    // %1
551      "+r"(src_v),    // %2
552      "+r"(dst_argb4444),  // %3
553      "+r"(width)     // %4
554    : "r"(&kUVToRB),  // %5
555      "r"(&kUVToG)    // %6
556    : "cc", "memory", "q0", "q1", "q2", "q3",
557      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
558  );
559}
560#endif  // HAS_I422TOARGB4444ROW_NEON
561
562#ifdef HAS_YTOARGBROW_NEON
// Converts one row of luma-only data to 32-bit pixels stored as B, G, R, A
// bytes (A = 255). Chroma is fixed at 128 by READYUV400, so the chroma terms
// of YUV422TORGB are zero and B = G = R = saturate(((Y - 16) * 74) >> 6).
//   src_y:    luma plane; 8 bytes read per pass.
//   dst_argb: destination; 32 bytes written per pass.
//   width:    pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
563void YToARGBRow_NEON(const uint8* src_y,
564                     uint8* dst_argb,
565                     int width) {
566  asm volatile (
    // Same constant setup as the chroma-aware converters; the coefficient
    // tables are still loaded because YUV422TORGB multiplies by them.
567    MEMACCESS(3)
568    "vld1.8     {d24}, [%3]                    \n"
569    MEMACCESS(4)
570    "vld1.8     {d25}, [%4]                    \n"
571    "vmov.u8    d26, #128                      \n"
572    "vmov.u16   q14, #74                       \n"
573    "vmov.u16   q15, #16                       \n"
574    ".p2align   2                              \n"
575  "1:                                          \n"
576    READYUV400
577    YUV422TORGB
578    "subs       %2, %2, #8                     \n"
    // d23 supplies the constant alpha byte for the 4-way interleaved store.
579    "vmov.u8    d23, #255                      \n"
580    MEMACCESS(1)
581    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
582    "bgt        1b                             \n"
583    : "+r"(src_y),     // %0
584      "+r"(dst_argb),  // %1
585      "+r"(width)      // %2
586    : "r"(&kUVToRB),   // %3
587      "r"(&kUVToG)     // %4
588    : "cc", "memory", "q0", "q1", "q2", "q3",
589      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
590  );
591}
592#endif  // HAS_YTOARGBROW_NEON
593
594#ifdef HAS_I400TOARGBROW_NEON
// Expands one row of 8-bit grey values to 32-bit pixels by copying each source
// byte unchanged into the first three bytes of the pixel and writing 255 as
// the fourth. No scaling or offset is applied (unlike YToARGBRow_NEON).
//   src_y:    source bytes; 8 read per pass.
//   dst_argb: destination; 32 bytes written per pass.
//   width:    pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
595void I400ToARGBRow_NEON(const uint8* src_y,
596                        uint8* dst_argb,
597                        int width) {
598  asm volatile (
599    ".p2align   2                              \n"
    // Alpha is loop-invariant, so it is set once before the loop label.
600    "vmov.u8    d23, #255                      \n"
601  "1:                                          \n"
602    MEMACCESS(0)
603    "vld1.8     {d20}, [%0]!                   \n"
604    "vmov       d21, d20                       \n"
605    "vmov       d22, d20                       \n"
606    "subs       %2, %2, #8                     \n"
607    MEMACCESS(1)
608    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
609    "bgt        1b                             \n"
610    : "+r"(src_y),     // %0
611      "+r"(dst_argb),  // %1
612      "+r"(width)      // %2
613    :
614    : "cc", "memory", "d20", "d21", "d22", "d23"
615  );
616}
617#endif  // HAS_I400TOARGBROW_NEON
618
619#ifdef HAS_NV12TOARGBROW_NEON
// Converts one row of biplanar YUV (luma plane plus an interleaved chroma
// plane read by READNV12) to 32-bit pixels stored as B, G, R, A bytes
// (A = 255).
//   src_y:    luma plane; 8 bytes read per pass.
//   src_uv:   interleaved chroma plane; 8 bytes (4 pairs) read per pass.
//   dst_argb: destination; 32 bytes written per pass.
//   width:    pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
620void NV12ToARGBRow_NEON(const uint8* src_y,
621                        const uint8* src_uv,
622                        uint8* dst_argb,
623                        int width) {
624  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
625    MEMACCESS(4)
626    "vld1.8     {d24}, [%4]                    \n"
627    MEMACCESS(5)
628    "vld1.8     {d25}, [%5]                    \n"
629    "vmov.u8    d26, #128                      \n"
630    "vmov.u16   q14, #74                       \n"
631    "vmov.u16   q15, #16                       \n"
632    ".p2align   2                              \n"
633  "1:                                          \n"
634    READNV12
635    YUV422TORGB
636    "subs       %3, %3, #8                     \n"
    // d23 supplies the constant alpha byte for the 4-way interleaved store.
637    "vmov.u8    d23, #255                      \n"
638    MEMACCESS(2)
639    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
640    "bgt        1b                             \n"
641    : "+r"(src_y),     // %0
642      "+r"(src_uv),    // %1
643      "+r"(dst_argb),  // %2
644      "+r"(width)      // %3
645    : "r"(&kUVToRB),   // %4
646      "r"(&kUVToG)     // %5
647    : "cc", "memory", "q0", "q1", "q2", "q3",
648      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
649  );
650}
651#endif  // HAS_NV12TOARGBROW_NEON
652
653#ifdef HAS_NV21TOARGBROW_NEON
// Converts one row of biplanar YUV (luma plane plus an interleaved chroma
// plane read by READNV21, i.e. with the opposite byte order to NV12) to
// 32-bit pixels stored as B, G, R, A bytes (A = 255).
//   src_y:    luma plane; 8 bytes read per pass.
//   src_uv:   interleaved chroma plane; 8 bytes (4 pairs) read per pass.
//   dst_argb: destination; 32 bytes written per pass.
//   width:    pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
654void NV21ToARGBRow_NEON(const uint8* src_y,
655                        const uint8* src_uv,
656                        uint8* dst_argb,
657                        int width) {
658  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
659    MEMACCESS(4)
660    "vld1.8     {d24}, [%4]                    \n"
661    MEMACCESS(5)
662    "vld1.8     {d25}, [%5]                    \n"
663    "vmov.u8    d26, #128                      \n"
664    "vmov.u16   q14, #74                       \n"
665    "vmov.u16   q15, #16                       \n"
666    ".p2align   2                              \n"
667  "1:                                          \n"
668    READNV21
669    YUV422TORGB
670    "subs       %3, %3, #8                     \n"
    // d23 supplies the constant alpha byte for the 4-way interleaved store.
671    "vmov.u8    d23, #255                      \n"
672    MEMACCESS(2)
673    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
674    "bgt        1b                             \n"
675    : "+r"(src_y),     // %0
676      "+r"(src_uv),    // %1
677      "+r"(dst_argb),  // %2
678      "+r"(width)      // %3
679    : "r"(&kUVToRB),   // %4
680      "r"(&kUVToG)     // %5
681    : "cc", "memory", "q0", "q1", "q2", "q3",
682      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
683  );
684}
685#endif  // HAS_NV21TOARGBROW_NEON
686
687#ifdef HAS_NV12TORGB565ROW_NEON
// Converts one row of biplanar YUV (luma plane plus an interleaved chroma
// plane read by READNV12) to 16-bit pixels packed by ARGBTORGB565.
//   src_y:      luma plane; 8 bytes read per pass.
//   src_uv:     interleaved chroma plane; 8 bytes (4 pairs) read per pass.
//   dst_rgb565: destination; 16 bytes written per pass.
//   width:      pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
688void NV12ToRGB565Row_NEON(const uint8* src_y,
689                          const uint8* src_uv,
690                          uint8* dst_rgb565,
691                          int width) {
692  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
693    MEMACCESS(4)
694    "vld1.8     {d24}, [%4]                    \n"
695    MEMACCESS(5)
696    "vld1.8     {d25}, [%5]                    \n"
697    "vmov.u8    d26, #128                      \n"
698    "vmov.u16   q14, #74                       \n"
699    "vmov.u16   q15, #16                       \n"
700    ".p2align   2                              \n"
701  "1:                                          \n"
702    READNV12
703    YUV422TORGB
704    "subs       %3, %3, #8                     \n"
705    ARGBTORGB565
706    MEMACCESS(2)
707    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
708    "bgt        1b                             \n"
709    : "+r"(src_y),     // %0
710      "+r"(src_uv),    // %1
711      "+r"(dst_rgb565),  // %2
712      "+r"(width)      // %3
713    : "r"(&kUVToRB),   // %4
714      "r"(&kUVToG)     // %5
715    : "cc", "memory", "q0", "q1", "q2", "q3",
716      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
717  );
718}
719#endif  // HAS_NV12TORGB565ROW_NEON
720
721#ifdef HAS_NV21TORGB565ROW_NEON
// Converts one row of biplanar YUV (luma plane plus an interleaved chroma
// plane read by READNV21, i.e. with the opposite byte order to NV12) to
// 16-bit pixels packed by ARGBTORGB565.
//   src_y:      luma plane; 8 bytes read per pass.
//   src_uv:     interleaved chroma plane; 8 bytes (4 pairs) read per pass.
//   dst_rgb565: destination; 16 bytes written per pass.
//   width:      pixel count; decremented by 8 per pass.
// The loop body always runs at least once and repeats while the remaining
// count is > 0, so width is presumably expected to be a positive multiple of
// 8 (or the buffers padded) — confirm against callers.
722void NV21ToRGB565Row_NEON(const uint8* src_y,
723                          const uint8* src_uv,
724                          uint8* dst_rgb565,
725                          int width) {
726  asm volatile (
    // Loop-invariant setup: chroma coefficients, chroma bias, Y scale/offset.
727    MEMACCESS(4)
728    "vld1.8     {d24}, [%4]                    \n"
729    MEMACCESS(5)
730    "vld1.8     {d25}, [%5]                    \n"
731    "vmov.u8    d26, #128                      \n"
732    "vmov.u16   q14, #74                       \n"
733    "vmov.u16   q15, #16                       \n"
734    ".p2align   2                              \n"
735  "1:                                          \n"
736    READNV21
737    YUV422TORGB
738    "subs       %3, %3, #8                     \n"
739    ARGBTORGB565
740    MEMACCESS(2)
741    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
742    "bgt        1b                             \n"
743    : "+r"(src_y),     // %0
744      "+r"(src_uv),    // %1
745      "+r"(dst_rgb565),  // %2
746      "+r"(width)      // %3
747    : "r"(&kUVToRB),   // %4
748      "r"(&kUVToG)     // %5
749    : "cc", "memory", "q0", "q1", "q2", "q3",
750      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
751  );
752}
753#endif  // HAS_NV21TORGB565ROW_NEON
754
#ifdef HAS_YUY2TOARGBROW_NEON
// Converts a row read through the READYUY2 macro to 4-byte pixels, 8 pixels
// (32 output bytes) per loop iteration.  YUV422TORGB (defined elsewhere in
// this file) presumably leaves the colour channels in d20..d22; d23 is set
// to 255 here and stored as the fourth byte of every pixel.
// NOTE(review): the body uses 32-bit ARM NEON mnemonics although this
// section is compiled for __aarch64__; presumably HAS_YUY2TOARGBROW_NEON
// stays undefined until it is ported - confirm.
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8     {d24}, [%3]                    \n"  // d24 = 8 bytes at &kUVToRB
    MEMACCESS(4)
    "vld1.8     {d25}, [%4]                    \n"  // d25 = 8 bytes at &kUVToG
    "vmov.u8    d26, #128                      \n"  // every byte lane = 128
    "vmov.u16   q14, #74                       \n"  // every 16-bit lane = 74
    "vmov.u16   q15, #16                       \n"  // every 16-bit lane = 16
    ".p2align   2                              \n"
  "1:                                          \n"
    READYUY2
    YUV422TORGB
    "subs       %2, %2, #8                     \n"  // 8 pixels per loop; sets flags for bgt.
    "vmov.u8    d23, #255                      \n"  // fourth channel = 255
    MEMACCESS(1)
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // interleaved store of 8 pixels.
    "bgt        1b                             \n"  // repeat while width > 0.
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : "r"(&kUVToRB),   // %3
      "r"(&kUVToG)     // %4
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_YUY2TOARGBROW_NEON
786
#ifdef HAS_UYVYTOARGBROW_NEON
// Converts a row read through the READUYVY macro to 4-byte pixels, 8 pixels
// (32 output bytes) per loop iteration.  YUV422TORGB (defined elsewhere in
// this file) presumably leaves the colour channels in d20..d22; d23 is set
// to 255 here and stored as the fourth byte of every pixel.
// NOTE(review): the body uses 32-bit ARM NEON mnemonics although this
// section is compiled for __aarch64__; presumably HAS_UYVYTOARGBROW_NEON
// stays undefined until it is ported - confirm.
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8     {d24}, [%3]                    \n"  // d24 = 8 bytes at &kUVToRB
    MEMACCESS(4)
    "vld1.8     {d25}, [%4]                    \n"  // d25 = 8 bytes at &kUVToG
    "vmov.u8    d26, #128                      \n"  // every byte lane = 128
    "vmov.u16   q14, #74                       \n"  // every 16-bit lane = 74
    "vmov.u16   q15, #16                       \n"  // every 16-bit lane = 16
    ".p2align   2                              \n"
  "1:                                          \n"
    READUYVY
    YUV422TORGB
    "subs       %2, %2, #8                     \n"  // 8 pixels per loop; sets flags for bgt.
    "vmov.u8    d23, #255                      \n"  // fourth channel = 255
    MEMACCESS(1)
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // interleaved store of 8 pixels.
    "bgt        1b                             \n"  // repeat while width > 0.
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : "r"(&kUVToRB),   // %3
      "r"(&kUVToG)     // %4
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_UYVYTOARGBROW_NEON
818
// Reads 16 pairs of UV per loop iteration and writes the even bytes to dst_u
// and the odd bytes to dst_v.  The loop body runs before the count is
// tested, so at least 16 pairs are always processed.
#ifdef HAS_SPLITUVROW_NEON
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pairs of UV
    // width is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store U
    MEMACCESS(2)
    "st1        {v1.16b}, [%2], #16            \n"  // store V
    "bgt        1b                             \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
#endif  // HAS_SPLITUVROW_NEON
843
// Reads 16 U's and 16 V's per loop iteration and writes out 16 interleaved
// pairs of UV.  The loop body runs before the count is tested, so at least
// 16 pairs are always written.
#ifdef HAS_MERGEUVROW_NEON
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load U
    MEMACCESS(1)
    "ld1        {v1.16b}, [%1], #16            \n"  // load V
    // width is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
    MEMACCESS(2)
    "st2        {v0.16b, v1.16b}, [%2], #32    \n"  // store 16 pairs of UV
    "bgt        1b                             \n"
    : "+r"(src_u),   // %0
      "+r"(src_v),   // %1
      "+r"(dst_uv),  // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
#endif  // HAS_MERGEUVROW_NEON
869
// Copies 'count' bytes from src to dst, 32 bytes per loop iteration, using
// ld1/st1 on four 8-byte registers.  The loop body runs before the count is
// tested, so at least 32 bytes are always copied.
#ifdef HAS_COPYROW_NEON
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.8b-v3.8b}, [%0], #32       \n"  // load 32
    // count is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
    MEMACCESS(1)
    "st1        {v0.8b-v3.8b}, [%1], #32       \n"  // store 32
    "bgt        1b                             \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2  // Output registers
  :                     // Input registers
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_COPYROW_NEON
890
// SetRow8 writes 'count' bytes using a 32 bit value repeated, 16 bytes per
// loop iteration.  The loop body runs before the count is tested, so at
// least 16 bytes are always written.
#ifdef HAS_SETROW_NEON
void SetRow_NEON(uint8* dst, uint32 v32, int count) {
  asm volatile (
    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
  "1:                                          \n"
    // count is a 32-bit int: use the W view of its register (as already done
    // for v32 above), since the upper half of the X register is not
    // guaranteed to be extended.
    "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"  // store
    "bgt        1b                             \n"
  : "+r"(dst),   // %0
    "+r"(count)  // %1
  : "r"(v32)     // %2
  : "cc", "memory", "v0"
  );
}
#endif  // HAS_SETROW_NEON
908
// TODO(fbarchard): Make fully assembler
// Fills 'height' rows with the 32 bit value v32.  Each row is 'width' 32-bit
// words (width * 4 bytes, written by SetRow_NEON) and consecutive rows start
// dst_stride bytes apart.
#ifdef HAS_ARGBSETROWS_NEON
void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
                      int dst_stride, int height) {
  uint8* row = dst;
  int rows_done = 0;
  while (rows_done < height) {
    SetRow_NEON(row, v32, width << 2);  // 4 bytes per word.
    row += dst_stride;
    ++rows_done;
  }
}
#endif  // HAS_ARGBSETROWS_NEON
920
#ifdef HAS_MIRRORROW_NEON
// Writes the bytes of src to dst in reverse order, 16 bytes per loop
// iteration.  src is read backwards starting from its last 16 bytes while
// dst is written forwards.  The loop body runs before the count is tested,
// so at least 16 bytes are always processed.
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.  width is a 32-bit int, so sign-extend its
    // W register before adding it to the 64-bit pointer; the upper half of
    // the X register is not guaranteed to be extended.
    "add        %0, %0, %w2, sxtw              \n"
    "sub        %0, %0, #16                    \n"

    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
    "rev64      v0.16b, v0.16b                 \n"  // reverse bytes in each half
    // Storing the high half first completes the 16-byte reversal.
    MEMACCESS(1)
    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
    MEMACCESS(1)
    "st1        {v0.D}[0], [%1], #8            \n"
    "bgt        1b                             \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "r"((ptrdiff_t)-16)    // %3
  : "cc", "memory", "v0"
  );
}
#endif  // HAS_MIRRORROW_NEON
947
#ifdef HAS_MIRRORUVROW_NEON
// Reads interleaved UV pairs from the end of src_uv backwards and writes the
// even bytes, reversed, to dst_u and the odd bytes, reversed, to dst_v.
// Handles 8 pairs (16 source bytes) per loop iteration; the loop body runs
// before the count is tested, so at least 8 pairs are always processed.
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
  asm volatile (
    // Start at end of source row (width * 2 bytes).  width is a 32-bit int,
    // so sign-extend its W register before scaling and adding it to the
    // 64-bit pointer; the upper half of the X register is not guaranteed to
    // be extended.
    "add        %0, %0, %w3, sxtw #1           \n"
    "sub        %0, %0, #16                    \n"

    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
    "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
    "rev64      v0.8b, v0.8b                   \n"  // reverse the 8 even bytes
    "rev64      v1.8b, v1.8b                   \n"  // reverse the 8 odd bytes
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
    MEMACCESS(2)
    "st1        {v1.8b}, [%2], #8              \n"
    "bgt        1b                             \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(width)    // %3
  : "r"((ptrdiff_t)-16)      // %4
  : "cc", "memory", "v0", "v1"
  );
}
#endif  // HAS_MIRRORUVROW_NEON
977
#ifdef HAS_ARGBMIRRORROW_NEON
// Writes the 4-byte pixels of src to dst in reverse pixel order (bytes
// within a pixel keep their order), 4 pixels per loop iteration.  src is
// read backwards starting from its last 16 bytes while dst is written
// forwards.  The loop body runs before the count is tested, so at least 4
// pixels are always processed.
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row (width * 4 bytes).  width is a 32-bit int,
    // so sign-extend its W register before scaling and adding it to the
    // 64-bit pointer; the upper half of the X register is not guaranteed to
    // be extended.
    "add        %0, %0, %w2, sxtw #2           \n"
    "sub        %0, %0, #16                    \n"

    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
    "rev64      v0.4s, v0.4s                   \n"  // swap pixels in each half
    // Storing the high half first completes the 4-pixel reversal.
    MEMACCESS(1)
    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
    MEMACCESS(1)
    "st1        {v0.D}[0], [%1], #8            \n"
    "bgt        1b                             \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "r"((ptrdiff_t)-16)    // %3
  : "cc", "memory", "v0"
  );
}
#endif  // HAS_ARGBMIRRORROW_NEON
1004
#ifdef HAS_RGB24TOARGBROW_NEON
// Expands 3-byte pixels to 4-byte pixels by appending a 255 alpha byte to
// each; the three source bytes keep their order.  Handles 8 pixels per loop
// iteration; the loop body runs before the count is tested, so at least 8
// pixels are always processed.
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "movi       v4.8b, #255                    \n"  // Alpha
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v1.8b-v3.8b}, [%0], #24       \n"  // load 8 pixels of RGB24.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    MEMACCESS(1)
    "st4        {v1.8b-v4.8b}, [%1], #32       \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  :
  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
#endif  // HAS_RGB24TOARGBROW_NEON
1025
#ifdef HAS_RAWTOARGBROW_NEON
// Expands 3-byte pixels to 4-byte pixels: the three source bytes are written
// in reversed order (r g b -> b g r) followed by a 255 alpha byte.  Handles
// 8 pixels per loop iteration; the loop body runs before the count is
// tested, so at least 8 pixels are always processed.
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "movi       v5.8b, #255                    \n"  // Alpha
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.8b-v2.8b}, [%0], #24       \n"  // read r g b
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    // st4 needs consecutive registers, so copy g and r after b (v2).
    "mov        v3.8b, v1.8b                   \n"  // move g
    "mov        v4.8b, v0.8b                   \n"  // move r
    MEMACCESS(1)
    "st4        {v2.8b-v5.8b}, [%1], #32       \n"  // store b g r a
    "bgt        1b                             \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
#endif  // HAS_RAWTOARGBROW_NEON
1048
// Asm fragment (32-bit ARM NEON syntax): expands the 8 RGB565 pixels held in
// q0 into 8-bit channels, leaving B in d0, G in d1 and R in d2.  Each channel
// is widened by OR-ing its top bits into the vacated low bits.  Uses q2
// (d4, d5) and d6 as scratch.
#define RGB565TOARGB                                                           \
    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
1060
#ifdef HAS_RGB565TOARGBROW_NEON
// Converts 2-byte RGB565 pixels to 4-byte pixels with the fourth byte set to
// 255, 8 pixels per loop iteration.  The loop body runs before the count is
// tested, so at least 8 pixels are always processed.
// NOTE(review): the body uses 32-bit ARM NEON mnemonics although this
// section is compiled for __aarch64__; presumably HAS_RGB565TOARGBROW_NEON
// stays undefined until it is ported - confirm.
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // Alpha
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    RGB565TOARGB  // leaves B, G, R in d0, d1, d2; d3 is untouched.
    MEMACCESS(1)
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_argb),    // %1
    "+r"(pix)          // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
#endif  // HAS_RGB565TOARGBROW_NEON
1082
// Asm fragment (32-bit ARM NEON syntax): expands the 8 ARGB1555 pixels held
// in q0 into 8-bit channels, leaving B in d0, G in d1, R in d2 and A in d3.
// The 1-bit alpha becomes 0 or 255 (negating 1 gives all ones); the 5-bit
// colour channels are widened by OR-ing their top 3 bits into the low bits.
// Uses q2 and q3 as scratch.
#define ARGB1555TOARGB                                                         \
    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
1096
// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
// Asm fragment (32-bit ARM NEON syntax): expands the 8 pixels held in q0
// into 8-bit channels, leaving B in d0, G in d1 and R in d2.  Uses q2
// (d4, d5) and d6 as scratch.
#define RGB555TOARGB                                                           \
    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
1109
#ifdef HAS_ARGB1555TOARGBROW_NEON
// Converts 2-byte ARGB1555 pixels to 4-byte pixels, 8 pixels per loop
// iteration.  The loop body runs before the count is tested, so at least 8
// pixels are always processed.
// NOTE(review): the body uses 32-bit ARM NEON mnemonics although this
// section is compiled for __aarch64__; presumably
// HAS_ARGB1555TOARGBROW_NEON stays undefined until it is ported - confirm.
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  asm volatile (
    // ARGB1555TOARGB rewrites q1 (d2, d3) on every iteration, so this
    // initial 255 in d3 is replaced by the alpha derived from the source.
    "vmov.u8    d3, #255                       \n"  // Alpha
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB1555TOARGB
    MEMACCESS(1)
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_argb),    // %1
    "+r"(pix)          // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
#endif  // HAS_ARGB1555TOARGBROW_NEON
1132
// Asm fragment (32-bit ARM NEON syntax): expands the 8 ARGB4444 pixels held
// in q0 into 8-bit channels, leaving B in d0, G in d1, R in d2 and A in d3.
// Each 4-bit channel is duplicated into both nibbles of its output byte.
// Uses q2 as scratch.
#define ARGB4444TOARGB                                                         \
    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
1142
#ifdef HAS_ARGB4444TOARGBROW_NEON
// Converts 2-byte ARGB4444 pixels to 4-byte pixels, 8 pixels per loop
// iteration.  The loop body runs before the count is tested, so at least 8
// pixels are always processed.
// NOTE(review): the body uses 32-bit ARM NEON mnemonics although this
// section is compiled for __aarch64__; presumably
// HAS_ARGB4444TOARGBROW_NEON stays undefined until it is ported - confirm.
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  asm volatile (
    // ARGB4444TOARGB rewrites q1 (d2, d3) on every iteration, so this
    // initial 255 in d3 is replaced by the alpha derived from the source.
    "vmov.u8    d3, #255                       \n"  // Alpha
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB4444TOARGB
    MEMACCESS(1)
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_argb),    // %1
    "+r"(pix)          // %2
  :
  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
#endif  // HAS_ARGB4444TOARGBROW_NEON
1165
#ifdef HAS_ARGBTORGB24ROW_NEON
// Reduces 4-byte pixels to 3-byte pixels by dropping the fourth byte of each;
// the first three bytes keep their order.  Handles 8 pixels per loop
// iteration; the loop body runs before the count is tested, so at least 8
// pixels are always processed.
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v1.8b-v4.8b}, [%0], #32       \n"  // load 8 pixels of ARGB.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    MEMACCESS(1)
    "st3        {v1.8b-v3.8b}, [%1], #24       \n"  // store 8 pixels of RGB24.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_rgb24),  // %1
    "+r"(pix)         // %2
  :
  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
#endif  // HAS_ARGBTORGB24ROW_NEON
1185
#ifdef HAS_ARGBTORAWROW_NEON
// Reduces 4-byte pixels to 3-byte pixels: the fourth byte is dropped and the
// first three are written in reversed order (b g r -> r g b).  Handles 8
// pixels per loop iteration; the loop body runs before the count is tested,
// so at least 8 pixels are always processed.
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v1.8b-v4.8b}, [%0], #32       \n"  // load b g r a
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    // st3 needs consecutive registers, so copy g and b after r (v3); the
    // alpha loaded into v4 is overwritten.
    "mov        v4.8b, v2.8b                   \n"  // mov g
    "mov        v5.8b, v1.8b                   \n"  // mov b
    MEMACCESS(1)
    "st3        {v3.8b-v5.8b}, [%1], #24       \n"  // store r g b
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_raw),   // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
#endif  // HAS_ARGBTORAWROW_NEON
1207
#ifdef HAS_YUY2TOYROW_NEON
// Extracts the even-indexed bytes (the Y samples of YUY2) from src_yuy2 into
// dst_y.  Handles 16 pixels (32 source bytes) per loop iteration; the loop
// body runs before the count is tested, so at least 16 pixels are always
// processed.
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pixels of YUY2.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
    "bgt        1b                             \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
#endif  // HAS_YUY2TOYROW_NEON
1227
#ifdef HAS_UYVYTOYROW_NEON
// Extracts the odd-indexed bytes (the Y samples of UYVY) from src_uyvy into
// dst_y.  Handles 16 pixels (32 source bytes) per loop iteration; the loop
// body runs before the count is tested, so at least 16 pixels are always
// processed.
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pixels of UYVY.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
    MEMACCESS(1)
    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
    "bgt        1b                             \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
#endif  // HAS_UYVYTOYROW_NEON
1247
#ifdef HAS_YUY2TOUV422ROW_NEON
// Extracts U (byte 1 of every 4) and V (byte 3 of every 4) from src_yuy2
// into dst_u and dst_v.  Handles 16 pixels (32 source bytes, 8 U and 8 V)
// per loop iteration; the loop body runs before the count is tested, so at
// least 16 pixels are always processed.
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of YUY2.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
    MEMACCESS(2)
    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
    "bgt        1b                             \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_YUY2TOUV422ROW_NEON
1271
#ifdef HAS_UYVYTOUV422ROW_NEON
// Extracts U (byte 0 of every 4) and V (byte 2 of every 4) from src_uyvy
// into dst_u and dst_v.  Handles 16 pixels (32 source bytes, 8 U and 8 V)
// per loop iteration; the loop body runs before the count is tested, so at
// least 16 pixels are always processed.
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of UYVY.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
    MEMACCESS(2)
    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
    "bgt        1b                             \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_UYVYTOUV422ROW_NEON
1295
#ifdef HAS_YUY2TOUVROW_NEON
// Extracts U and V from two rows of YUY2 (src_yuy2 and the row stride_yuy2
// bytes after it) and writes their rounded averages to dst_u and dst_v.
// Handles 16 pixels (8 U and 8 V) per loop iteration; the loop body runs
// before the count is tested, so at least 16 pixels are always processed.
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  // Form the second-row pointer in C so that a 64-bit address is never kept
  // in the register bound to the 32-bit stride operand.
  const uint8* src_yuy2_row2 = src_yuy2 + stride_yuy2;
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of YUY2.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load next row YUY2.
    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
    MEMACCESS(2)
    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
    MEMACCESS(3)
    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
    "bgt        1b                             \n"
  : "+r"(src_yuy2),       // %0
    "+r"(src_yuy2_row2),  // %1
    "+r"(dst_u),          // %2
    "+r"(dst_v),          // %3
    "+r"(pix)             // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"  // Clobber List
  );
}
#endif  // HAS_YUY2TOUVROW_NEON
1325
#ifdef HAS_UYVYTOUVROW_NEON
// Extracts U and V from two rows of UYVY (src_uyvy and the row stride_uyvy
// bytes after it) and writes their rounded averages to dst_u and dst_v.
// Handles 16 pixels (8 U and 8 V) per loop iteration; the loop body runs
// before the count is tested, so at least 16 pixels are always processed.
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  // Form the second-row pointer in C so that a 64-bit address is never kept
  // in the register bound to the 32-bit stride operand.
  const uint8* src_uyvy_row2 = src_uyvy + stride_uyvy;
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of UYVY.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load next row UYVY.
    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
    MEMACCESS(3)
    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
    "bgt        1b                             \n"
  : "+r"(src_uyvy),       // %0
    "+r"(src_uyvy_row2),  // %1
    "+r"(dst_u),          // %2
    "+r"(dst_v),          // %3
    "+r"(pix)             // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"  // Clobber List
  );
}
#endif  // HAS_UYVYTOUVROW_NEON
1355
#ifdef HAS_HALFROW_NEON
// Writes to dst_uv the rounded byte-wise average of two rows: src_uv and the
// row src_uv_stride bytes after it.  Handles 16 bytes per loop iteration;
// the loop body runs before the count is tested, so at least 16 bytes are
// always processed.
void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  // Form the row 2 pointer in C so that a 64-bit address is never kept in
  // the register bound to the 32-bit stride operand.
  const uint8* src_uv_row2 = src_uv + src_uv_stride;
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load row 1 16 pixels.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
    MEMACCESS(1)
    "ld1        {v1.16b}, [%1], #16            \n"  // load row 2 16 pixels.
    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // average row 1 and 2
    MEMACCESS(2)
    "st1        {v0.16b}, [%2], #16            \n"
    "bgt        1b                             \n"
  : "+r"(src_uv),       // %0
    "+r"(src_uv_row2),  // %1
    "+r"(dst_uv),       // %2
    "+r"(pix)           // %3
  :
  : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
#endif  // HAS_HALFROW_NEON
1381
// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
// The 4 bytes of 'selector' are byte indices into each group of 4 pixels
// (16 bytes); 4 bytes are picked from each group.  Handles 8 pixels per loop
// iteration; the loop body runs before the count is tested, so at least 8
// pixels are always processed.
#ifdef HAS_ARGBTOBAYERROW_NEON
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                         uint32 selector, int pix) {
  asm volatile (
    "mov        v2.s[0], %w3                   \n"  // selector
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b, v1.16b}, [%0], #32    \n"  // load row 8 pixels.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    // Only the low 4 bytes of each tbl result are meaningful (v2.s[1] was
    // never set); trn1 keeps exactly those in the low 8 bytes of v4.
    "tbl        v4.8b, {v0.16b}, v2.8b         \n"  // look up 4 pixels
    "tbl        v5.8b, {v1.16b}, v2.8b         \n"  // look up 4 pixels
    "trn1       v4.4s, v4.4s, v5.4s            \n"  // combine 8 pixels
    MEMACCESS(1)
    "st1        {v4.8b}, [%1], #8              \n"  // store 8.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_bayer),  // %1
    "+r"(pix)         // %2
  : "r"(selector)     // %3
  : "cc", "memory", "v0", "v1", "v2", "v4", "v5"   // Clobber List
  );
}
#endif  // HAS_ARGBTOBAYERROW_NEON
1406
// Select G channels from ARGB.  e.g.  GGGGGGGG
// Copies the second byte of every 4-byte pixel to dst_bayer; the selector
// argument is ignored.  Handles 8 pixels per loop iteration; the loop body
// runs before the count is tested, so at least 8 pixels are always
// processed.
#ifdef HAS_ARGBTOBAYERGGROW_NEON
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                           uint32 /*selector*/, int pix) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load row 8 pixels.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    MEMACCESS(1)
    "st1        {v1.8b}, [%1], #8              \n"  // store 8 G's.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_bayer),  // %1
    "+r"(pix)         // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_ARGBTOBAYERGGROW_NEON
1427
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders bytes within each group of 4 pixels (16 bytes) using the 16 byte
// indices read from 'shuffler'.  Handles 4 pixels per loop iteration; the
// loop body runs before the count is tested, so at least 4 pixels are always
// processed.
#ifdef HAS_ARGBSHUFFLEROW_NEON
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  asm volatile (
    MEMACCESS(3)
    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
    // pix is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
    MEMACCESS(1)
    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
  );
}
#endif  // HAS_ARGBSHUFFLEROW_NEON
1451
#ifdef HAS_I422TOYUY2ROW_NEON
// Interleaves separate Y, U and V rows into packed Y0 U Y1 V groups.
// Handles 16 pixels (16 Y, 8 U, 8 V -> 32 output bytes) per loop iteration;
// the loop body runs before the count is tested, so at least 16 pixels are
// always processed.
void I422ToYUY2Row_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_yuy2, int width) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
    "mov        v2.8b, v1.8b                   \n"  // odd Ys go in slot 2
    MEMACCESS(1)
    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
    MEMACCESS(2)
    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
    // width is a 32-bit int: use the W view of its register, since the upper
    // half of the X register is not guaranteed to be extended.
    "subs       %w4, %w4, #16                  \n"  // 16 pixels
    MEMACCESS(3)
    "st4        {v0.8b-v3.8b}, [%3], #32       \n"  // Store 8 YUY2/16 pixels.
    "bgt        1b                             \n"
  : "+r"(src_y),     // %0
    "+r"(src_u),     // %1
    "+r"(src_v),     // %2
    "+r"(dst_yuy2),  // %3
    "+r"(width)      // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_I422TOYUY2ROW_NEON
1481
1482#ifdef HAS_I422TOUYVYROW_NEON
// Interleaves planar Y, U and V into packed UYVY (byte order U Y0 V Y1),
// 16 pixels per iteration: 16 Y, 8 U and 8 V bytes in, 32 bytes out.
//   src_y/src_u/src_v: source planes, advanced by 16/8/8 bytes per iteration.
//   dst_uyvy:          destination, advanced by 32 bytes per iteration.
//   width:             pixel count; decremented by 16, loop runs while > 0.
// width is a 32-bit int, so the counter is accessed through the W view of its
// register (%w4); the upper half of the X register is not guaranteed to be a
// valid extension of the int and must not influence the bgt test.
1483void I422ToUYVYRow_NEON(const uint8* src_y,
1484                        const uint8* src_u,
1485                        const uint8* src_v,
1486                        uint8* dst_uyvy, int width) {
1487  asm volatile (
1488    ".p2align   2                              \n"
1489  "1:                                          \n"
1490    MEMACCESS(0)
1491    "ld2        {v1.8b, v2.8b}, [%0], #16      \n"  // load 16 Ys: v1 = even, v2 = odd
1492    "mov        v3.8b, v2.8b                   \n"  // odd Ys to v3; v2 is reused for V
1493    MEMACCESS(1)
1494    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
1495    MEMACCESS(2)
1496    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
1497    "subs       %w4, %w4, #16                  \n"  // 16 pixels
1498    MEMACCESS(3)
1499    "st4        {v0.8b-v3.8b}, [%3], #32       \n"  // Store 8 UYVY/16 pixels.
1500    "bgt        1b                             \n"
1501  : "+r"(src_y),     // %0
1502    "+r"(src_u),     // %1
1503    "+r"(src_v),     // %2
1504    "+r"(dst_uyvy),  // %3
1505    "+r"(width)      // %4
1506  :
1507  : "cc", "memory", "v0", "v1", "v2", "v3"
1508  );
1509}
1510#endif  // HAS_I422TOUYVYROW_NEON
1511
1512#ifdef HAS_ARGBTORGB565ROW_NEON
// Converts ARGB to RGB565, 8 pixels per iteration: 32 bytes are loaded
// de-interleaved into d20..d23, the ARGBTORGB565 macro (defined elsewhere in
// this file) is expanded, and the 16 bytes of q0 are stored to dst_rgb565.
//   pix: pixel count; decremented by 8 per iteration, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vld4.8/vst1.8, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ARGBTORGB565ROW_NEON stays undefined until this is ported - confirm.
1513void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
1514  asm volatile (
1515    ".p2align   2                              \n"
1516  "1:                                          \n"
1517    MEMACCESS(0)
1518    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1519    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1520    ARGBTORGB565
1521    MEMACCESS(1)
1522    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
1523    "bgt        1b                             \n"
1524  : "+r"(src_argb),  // %0
1525    "+r"(dst_rgb565),  // %1
1526    "+r"(pix)        // %2
1527  :
1528  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1529  );
1530}
1531#endif  // HAS_ARGBTORGB565ROW_NEON
1532
1533#ifdef HAS_ARGBTOARGB1555ROW_NEON
// Converts ARGB to ARGB1555, 8 pixels per iteration: 32 bytes are loaded
// de-interleaved into d20..d23, the ARGBTOARGB1555 macro (defined elsewhere in
// this file) is expanded, and the 16 bytes of q0 are stored to dst_argb1555.
//   pix: pixel count; decremented by 8 per iteration, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vld4.8/vst1.8, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ARGBTOARGB1555ROW_NEON stays undefined until this is ported - confirm.
1534void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1535                            int pix) {
1536  asm volatile (
1537    ".p2align   2                              \n"
1538  "1:                                          \n"
1539    MEMACCESS(0)
1540    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1541    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1542    ARGBTOARGB1555
1543    MEMACCESS(1)
1544    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
1545    "bgt        1b                             \n"
1546  : "+r"(src_argb),  // %0
1547    "+r"(dst_argb1555),  // %1
1548    "+r"(pix)        // %2
1549  :
1550  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1551  );
1552}
1553#endif  // HAS_ARGBTOARGB1555ROW_NEON
1554
1555#ifdef HAS_ARGBTOARGB4444ROW_NEON
// Converts ARGB to ARGB4444, 8 pixels per iteration: 32 bytes are loaded
// de-interleaved into d20..d23, the ARGBTOARGB4444 macro (defined elsewhere in
// this file) is expanded, and the 16 bytes of q0 are stored to dst_argb4444.
//   pix: pixel count; decremented by 8 per iteration, loop runs while > 0.
// d4 is loaded with 0x0f before the loop, so q2 (which contains d4) is listed
// as clobbered; without it the compiler may keep a live value in that
// register across the asm statement.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vst1.8, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ARGBTOARGB4444ROW_NEON stays undefined until this is ported - confirm.
1556void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1557                            int pix) {
1558  asm volatile (
1559    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
1560    ".p2align   2                              \n"
1561  "1:                                          \n"
1562    MEMACCESS(0)
1563    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1564    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1565    ARGBTOARGB4444
1566    MEMACCESS(1)
1567    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
1568    "bgt        1b                             \n"
1569  : "+r"(src_argb),      // %0
1570    "+r"(dst_argb4444),  // %1
1571    "+r"(pix)            // %2
1572  :
1573  : "cc", "memory", "q0", "q2", "q8", "q9", "q10", "q11"
1574  );
1575}
1576#endif  // HAS_ARGBTOARGB4444ROW_NEON
1577
1578#ifdef HAS_ARGBTOYROW_NEON
// Computes 8 luma bytes per iteration from 8 ARGB pixels:
//   Y = sat8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16   (saturating add)
// The alpha lane loaded into v3 is discarded: v3 is immediately reused as the
// 16-bit accumulator.
//   src_argb: source pixels, advanced by 32 bytes per iteration.
//   dst_y:    destination luma, advanced by 8 bytes per iteration.
//   pix:      pixel count; decremented by 8, loop runs while > 0.
// pix is a 32-bit int, so the counter is accessed through the W view of its
// register (%w2); the upper half of the X register is not guaranteed to be a
// valid extension of the int and must not influence the bgt test.
1579void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1580  asm volatile (
1581    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1582    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1583    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1584    "movi       v7.8b, #16                     \n"  // Add 16 constant
1585    ".p2align   2                              \n"
1586  "1:                                          \n"
1587    MEMACCESS(0)
1588    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
1589    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1590    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1591    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1592    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1593    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1594    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
1595    MEMACCESS(1)
1596    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1597    "bgt        1b                             \n"
1598  : "+r"(src_argb),  // %0
1599    "+r"(dst_y),     // %1
1600    "+r"(pix)        // %2
1601  :
1602  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1603  );
1604}
1605#endif  // HAS_ARGBTOYROW_NEON
1606
1607#ifdef HAS_ARGBTOYJROW_NEON
// Computes 8 luma bytes per iteration from 8 ARGB pixels using the
// coefficients 15/75/38 with no offset added:
//   Y = sat8(round((15 * B + 75 * G + 38 * R) >> 7))
// The alpha lane loaded into v3 is discarded: v3 is immediately reused as the
// 16-bit accumulator.
//   src_argb: source pixels, advanced by 32 bytes per iteration.
//   dst_y:    destination luma, advanced by 8 bytes per iteration.
//   pix:      pixel count; decremented by 8, loop runs while > 0.
// pix is a 32-bit int, so the counter is accessed through the W view of its
// register (%w2); the upper half of the X register is not guaranteed to be a
// valid extension of the int and must not influence the bgt test.
1608void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1609  asm volatile (
1610    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
1611    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
1612    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
1613    ".p2align   2                              \n"
1614  "1:                                          \n"
1615    MEMACCESS(0)
1616    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
1617    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1618    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1619    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1620    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1621    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
1622    MEMACCESS(1)
1623    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1624    "bgt        1b                             \n"
1625  : "+r"(src_argb),  // %0
1626    "+r"(dst_y),     // %1
1627    "+r"(pix)        // %2
1628  :
1629  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1630  );
1631}
1632#endif  // HAS_ARGBTOYJROW_NEON
1633
1634// 8x1 pixels.
1635#ifdef HAS_ARGBTOUV444ROW_NEON
// Computes one U and one V byte per ARGB pixel, 8 pixels per iteration:
//   U = sat8((112 * B - 74 * G - 38 * R + 0x8080) >> 8)
//   V = sat8((112 * R - 94 * G - 18 * B + 0x8080) >> 8)
//   src_argb:    source pixels, advanced by 32 bytes per iteration.
//   dst_u/dst_v: destinations, each advanced by 8 bytes per iteration.
//   pix:         pixel count; decremented by 8, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vmull, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ARGBTOUV444ROW_NEON stays undefined until this is ported - confirm.
// NOTE(review): q4 is in the clobber list but is not referenced by the body.
1636void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1637                         int pix) {
1638  asm volatile (
1639    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
1640    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
1641    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
1642    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
1643    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
1644    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1645    ".p2align   2                              \n"
1646  "1:                                          \n"
1647    MEMACCESS(0)
1648    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1649    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
1650    "vmull.u8   q2, d0, d24                    \n"  // B
1651    "vmlsl.u8   q2, d1, d25                    \n"  // G
1652    "vmlsl.u8   q2, d2, d26                    \n"  // R
1653    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
1654
1655    "vmull.u8   q3, d2, d24                    \n"  // R
1656    "vmlsl.u8   q3, d1, d28                    \n"  // G
1657    "vmlsl.u8   q3, d0, d27                    \n"  // B
1658    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
1659
1660    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
1661    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
1662
1663    MEMACCESS(1)
1664    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1665    MEMACCESS(2)
1666    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1667    "bgt        1b                             \n"
1668  : "+r"(src_argb),  // %0
1669    "+r"(dst_u),     // %1
1670    "+r"(dst_v),     // %2
1671    "+r"(pix)        // %3
1672  :
1673  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
1674  );
1675}
1676#endif  // HAS_ARGBTOUV444ROW_NEON
1677
1678// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
1679#ifdef HAS_ARGBTOUV422ROW_NEON
// Produces 8 U and 8 V bytes from 16 ARGB pixels per iteration.  Each pair of
// horizontally adjacent pixels is summed per channel (vpaddl) and multiplied
// by half-valued coefficients (112 / 2, 74 / 2, ...), then 0x8080 is added and
// the result is narrowed with a saturating shift right by 8.
//   src_argb:    source pixels, advanced by 64 bytes per iteration.
//   dst_u/dst_v: destinations, each advanced by 8 bytes per iteration.
//   pix:         ARGB pixel count; decremented by 16, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vpaddl, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ARGBTOUV422ROW_NEON stays undefined until this is ported - confirm.
1680void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1681                         int pix) {
1682  asm volatile (
1683    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1684    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1685    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1686    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1687    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1688    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1689    ".p2align   2                              \n"
1690  "1:                                          \n"
1691    MEMACCESS(0)
1692    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1693    MEMACCESS(0)
1694    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1695
1696    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1697    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1698    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1699
1700    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
1701    "vmul.s16   q8, q0, q10                    \n"  // B
1702    "vmls.s16   q8, q1, q11                    \n"  // G
1703    "vmls.s16   q8, q2, q12                    \n"  // R
1704    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
1705
1706    "vmul.s16   q9, q2, q10                    \n"  // R
1707    "vmls.s16   q9, q1, q14                    \n"  // G
1708    "vmls.s16   q9, q0, q13                    \n"  // B
1709    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
1710
1711    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
1712    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1713
1714    MEMACCESS(1)
1715    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1716    MEMACCESS(2)
1717    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1718    "bgt        1b                             \n"
1719  : "+r"(src_argb),  // %0
1720    "+r"(dst_u),     // %1
1721    "+r"(dst_v),     // %2
1722    "+r"(pix)        // %3
1723  :
1724  : "cc", "memory", "q0", "q1", "q2", "q3",
1725    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1726  );
1727}
1728#endif  // HAS_ARGBTOUV422ROW_NEON
1729
1730// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
1731#ifdef HAS_ARGBTOUV411ROW_NEON
// Produces 8 U and 8 V bytes from 32 ARGB pixels per iteration.  Per channel,
// adjacent pixels are pair-summed twice (vpaddl, then vpadd), giving sums of 4
// pixels; the sums are halved with rounding (vrshr #1) and multiplied by
// half-valued coefficients, then 0x8080 is added and the result is narrowed
// with a saturating shift right by 8.
//   src_argb:    source pixels, advanced by 128 bytes per iteration.
//   dst_u/dst_v: destinations, each advanced by 8 bytes per iteration.
//   pix:         ARGB pixel count; decremented by 32, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vpaddl, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ARGBTOUV411ROW_NEON stays undefined until this is ported - confirm.
1732void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1733                         int pix) {
1734  asm volatile (
1735    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1736    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1737    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1738    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1739    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1740    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1741    ".p2align   2                              \n"
1742  "1:                                          \n"
1743    MEMACCESS(0)
1744    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1745    MEMACCESS(0)
1746    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1747    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1748    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1749    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1750    MEMACCESS(0)
1751    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
1752    MEMACCESS(0)
1753    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
1754    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
1755    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
1756    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
1757
1758    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
1759    "vpadd.u16  d1, d8, d9                     \n"  // B
1760    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
1761    "vpadd.u16  d3, d10, d11                   \n"  // G
1762    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
1763    "vpadd.u16  d5, d12, d13                   \n"  // R
1764
1765    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1766    "vrshr.u16  q1, q1, #1                     \n"
1767    "vrshr.u16  q2, q2, #1                     \n"
1768
1769    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
1770    "vmul.s16   q8, q0, q10                    \n"  // B
1771    "vmls.s16   q8, q1, q11                    \n"  // G
1772    "vmls.s16   q8, q2, q12                    \n"  // R
1773    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
1774    "vmul.s16   q9, q2, q10                    \n"  // R
1775    "vmls.s16   q9, q1, q14                    \n"  // G
1776    "vmls.s16   q9, q0, q13                    \n"  // B
1777    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
1778    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
1779    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1780    MEMACCESS(1)
1781    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1782    MEMACCESS(2)
1783    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1784    "bgt        1b                             \n"
1785  : "+r"(src_argb),  // %0
1786    "+r"(dst_u),     // %1
1787    "+r"(dst_v),     // %2
1788    "+r"(pix)        // %3
1789  :
1790  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1791    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1792  );
1793}
1794#endif  // HAS_ARGBTOUV411ROW_NEON
1795
1796// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
// RGBTOUV(QB, QG, QR): asm fragment shared by the *ToUVRow_NEON functions.
// QB/QG/QR name the q registers holding eight 16-bit B, G and R values.
// Expects the coefficients in q10..q14 and the 0x8080 bias in q15, as set up
// by each caller before its loop.  Computes
//   q8 = QB * q10 - QG * q11 - QR * q12 + q15   (U)
//   q9 = QR * q10 - QG * q14 - QB * q13 + q15   (V)
// and leaves the saturated high bytes in d0 (U) and d1 (V).  Overwrites q8,
// q9 and q0, so QB/QG/QR are no longer intact in q0 after expansion.
1797#define RGBTOUV(QB, QG, QR) \
1798    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
1799    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
1800    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
1801    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
1802    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
1803    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
1804    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
1805    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
1806    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
1807    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
1808
1809// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1810#ifdef HAS_ARGBTOUVROW_NEON
// Produces 8 U and 8 V bytes from a 16x2 block of ARGB pixels per iteration.
// Per channel, pixel pairs of the first row are summed (vpaddl) and the pairs
// of the second row are accumulated on top (vpadal), giving 2x2 sums; these
// are halved with rounding (vrshr #1) and fed to RGBTOUV with half-valued
// coefficients.
//   src_argb:        first row, advanced by 64 bytes per iteration.
//   src_stride_argb: byte offset from the first row to the second row; the
//                    asm replaces it in place with src_argb + stride and then
//                    uses it as the second-row pointer.
//   dst_u/dst_v:     destinations, each advanced by 8 bytes per iteration.
//   pix:             pixels per row; decremented by 16, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vpadal, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ARGBTOUVROW_NEON stays undefined until this is ported - confirm.
// NOTE(review): operand %1 is an int that ends up holding an address; that
// only fits where pointers are 32 bits wide - revisit when porting.
1811void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1812                      uint8* dst_u, uint8* dst_v, int pix) {
1813  asm volatile (
1814    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1815    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1816    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1817    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1818    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1819    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1820    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1821    ".p2align   2                              \n"
1822  "1:                                          \n"
1823    MEMACCESS(0)
1824    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1825    MEMACCESS(0)
1826    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1827    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1828    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1829    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1830    MEMACCESS(1)
1831    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1832    MEMACCESS(1)
1833    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
1834    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
1835    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1836    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1837
1838    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1839    "vrshr.u16  q1, q1, #1                     \n"
1840    "vrshr.u16  q2, q2, #1                     \n"
1841
1842    "subs       %4, %4, #16                    \n"  // 16 pixels of width (16x2 block) per loop.
1843    RGBTOUV(q0, q1, q2)
1844    MEMACCESS(2)
1845    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1846    MEMACCESS(3)
1847    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1848    "bgt        1b                             \n"
1849  : "+r"(src_argb),  // %0
1850    "+r"(src_stride_argb),  // %1
1851    "+r"(dst_u),     // %2
1852    "+r"(dst_v),     // %3
1853    "+r"(pix)        // %4
1854  :
1855  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1856    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1857  );
1858}
1859#endif  // HAS_ARGBTOUVROW_NEON
1860
1861// TODO(fbarchard): Subsample match C code.
1862#ifdef HAS_ARGBTOUVJROW_NEON
// Same structure as the other 16x2 -> 8x1 UV kernels in this file, but with
// the coefficient set 127/84/43/20/107 (each halved): produces 8 U and 8 V
// bytes from a 16x2 block of ARGB pixels per iteration.
//   src_argb:        first row, advanced by 64 bytes per iteration.
//   src_stride_argb: byte offset from the first row to the second row; the
//                    asm replaces it in place with src_argb + stride and then
//                    uses it as the second-row pointer.
//   dst_u/dst_v:     destinations, each advanced by 8 bytes per iteration.
//   pix:             pixels per row; decremented by 16, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vpadal, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ARGBTOUVJROW_NEON stays undefined until this is ported - confirm.
// NOTE(review): operand %1 is an int that ends up holding an address; that
// only fits where pointers are 32 bits wide - revisit when porting.
1863void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1864                       uint8* dst_u, uint8* dst_v, int pix) {
1865  asm volatile (
1866    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1867    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
1868    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
1869    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
1870    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
1871    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
1872    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1873    ".p2align   2                              \n"
1874  "1:                                          \n"
1875    MEMACCESS(0)
1876    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1877    MEMACCESS(0)
1878    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1879    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1880    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1881    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1882    MEMACCESS(1)
1883    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1884    MEMACCESS(1)
1885    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
1886    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
1887    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1888    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1889
1890    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1891    "vrshr.u16  q1, q1, #1                     \n"
1892    "vrshr.u16  q2, q2, #1                     \n"
1893
1894    "subs       %4, %4, #16                    \n"  // 16 pixels of width (16x2 block) per loop.
1895    RGBTOUV(q0, q1, q2)
1896    MEMACCESS(2)
1897    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1898    MEMACCESS(3)
1899    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1900    "bgt        1b                             \n"
1901  : "+r"(src_argb),  // %0
1902    "+r"(src_stride_argb),  // %1
1903    "+r"(dst_u),     // %2
1904    "+r"(dst_v),     // %3
1905    "+r"(pix)        // %4
1906  :
1907  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1908    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1909  );
1910}
1911#endif  // HAS_ARGBTOUVJROW_NEON
1912
1913#ifdef HAS_BGRATOUVROW_NEON
// Produces 8 U and 8 V bytes from a 16x2 block of BGRA pixels per iteration.
// The four de-interleaved byte lanes land in q0..q3 (row 0) and q4..q7
// (row 1); this variant takes B from q3/q7, G from q2/q6 and R from q1/q5,
// and does not use q0/q4.  2x2 sums are halved with rounding and passed to
// RGBTOUV(q3, q2, q1).
//   src_bgra:        first row, advanced by 64 bytes per iteration.
//   src_stride_bgra: byte offset from the first row to the second row; the
//                    asm replaces it in place with src_bgra + stride and then
//                    uses it as the second-row pointer.
//   dst_u/dst_v:     destinations, each advanced by 8 bytes per iteration.
//   pix:             pixels per row; decremented by 16, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vpadal, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_BGRATOUVROW_NEON stays undefined until this is ported - confirm.
// NOTE(review): operand %1 is an int that ends up holding an address; that
// only fits where pointers are 32 bits wide - revisit when porting.
1914void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1915                      uint8* dst_u, uint8* dst_v, int pix) {
1916  asm volatile (
1917    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
1918    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1919    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1920    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1921    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1922    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1923    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1924    ".p2align   2                              \n"
1925  "1:                                          \n"
1926    MEMACCESS(0)
1927    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
1928    MEMACCESS(0)
1929    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
1930    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
1931    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
1932    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
1933    MEMACCESS(1)
1934    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
1935    MEMACCESS(1)
1936    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
1937    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
1938    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
1939    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
1940
1941    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
1942    "vrshr.u16  q2, q2, #1                     \n"
1943    "vrshr.u16  q3, q3, #1                     \n"
1944
1945    "subs       %4, %4, #16                    \n"  // 16 pixels of width (16x2 block) per loop.
1946    RGBTOUV(q3, q2, q1)
1947    MEMACCESS(2)
1948    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1949    MEMACCESS(3)
1950    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1951    "bgt        1b                             \n"
1952  : "+r"(src_bgra),  // %0
1953    "+r"(src_stride_bgra),  // %1
1954    "+r"(dst_u),     // %2
1955    "+r"(dst_v),     // %3
1956    "+r"(pix)        // %4
1957  :
1958  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1959    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1960  );
1961}
1962#endif  // HAS_BGRATOUVROW_NEON
1963
1964#ifdef HAS_ABGRTOUVROW_NEON
// Produces 8 U and 8 V bytes from a 16x2 block of ABGR pixels per iteration.
// The four de-interleaved byte lanes land in q0..q3 (row 0) and q4..q7
// (row 1); this variant takes B from q2/q6, G from q1/q5 and R from q0/q4,
// and does not use q3/q7.  2x2 sums are halved with rounding and passed to
// RGBTOUV(q2, q1, q0).
//   src_abgr:        first row, advanced by 64 bytes per iteration.
//   src_stride_abgr: byte offset from the first row to the second row; the
//                    asm replaces it in place with src_abgr + stride and then
//                    uses it as the second-row pointer.
//   dst_u/dst_v:     destinations, each advanced by 8 bytes per iteration.
//   pix:             pixels per row; decremented by 16, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vpadal, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_ABGRTOUVROW_NEON stays undefined until this is ported - confirm.
// NOTE(review): operand %1 is an int that ends up holding an address; that
// only fits where pointers are 32 bits wide - revisit when porting.
1965void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1966                      uint8* dst_u, uint8* dst_v, int pix) {
1967  asm volatile (
1968    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
1969    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1970    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1971    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1972    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1973    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1974    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1975    ".p2align   2                              \n"
1976  "1:                                          \n"
1977    MEMACCESS(0)
1978    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
1979    MEMACCESS(0)
1980    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
1981    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
1982    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1983    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
1984    MEMACCESS(1)
1985    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
1986    MEMACCESS(1)
1987    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
1988    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
1989    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1990    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1991
1992    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1993    "vrshr.u16  q1, q1, #1                     \n"
1994    "vrshr.u16  q2, q2, #1                     \n"
1995
1996    "subs       %4, %4, #16                    \n"  // 16 pixels of width (16x2 block) per loop.
1997    RGBTOUV(q2, q1, q0)
1998    MEMACCESS(2)
1999    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2000    MEMACCESS(3)
2001    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2002    "bgt        1b                             \n"
2003  : "+r"(src_abgr),  // %0
2004    "+r"(src_stride_abgr),  // %1
2005    "+r"(dst_u),     // %2
2006    "+r"(dst_v),     // %3
2007    "+r"(pix)        // %4
2008  :
2009  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2010    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2011  );
2012}
2013#endif  // HAS_ABGRTOUVROW_NEON
2014
2015#ifdef HAS_RGBATOUVROW_NEON
// Produces 8 U and 8 V bytes from a 16x2 block of RGBA pixels per iteration.
// The four de-interleaved byte lanes land in q0..q3 (row 0) and q4..q7
// (row 1); this variant takes B from q1/q5, G from q2/q6 and R from q3/q7,
// writing the pair sums down into q0/q1/q2 (the original q0/q4 lane is
// overwritten or unused).  2x2 sums are halved with rounding and passed to
// RGBTOUV(q0, q1, q2).
//   src_rgba:        first row, advanced by 64 bytes per iteration.
//   src_stride_rgba: byte offset from the first row to the second row; the
//                    asm replaces it in place with src_rgba + stride and then
//                    uses it as the second-row pointer.
//   dst_u/dst_v:     destinations, each advanced by 8 bytes per iteration.
//   pix:             pixels per row; decremented by 16, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld4.8/vpadal, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_RGBATOUVROW_NEON stays undefined until this is ported - confirm.
// NOTE(review): operand %1 is an int that ends up holding an address; that
// only fits where pointers are 32 bits wide - revisit when porting.
2016void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
2017                      uint8* dst_u, uint8* dst_v, int pix) {
2018  asm volatile (
2019    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
2020    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
2021    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
2022    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
2023    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
2024    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2025    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2026    ".p2align   2                              \n"
2027  "1:                                          \n"
2028    MEMACCESS(0)
2029    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
2030    MEMACCESS(0)
2031    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
2032    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
2033    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
2034    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
2035    MEMACCESS(1)
2036    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
2037    MEMACCESS(1)
2038    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
2039    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
2040    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
2041    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
2042
2043    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
2044    "vrshr.u16  q1, q1, #1                     \n"
2045    "vrshr.u16  q2, q2, #1                     \n"
2046
2047    "subs       %4, %4, #16                    \n"  // 16 pixels of width (16x2 block) per loop.
2048    RGBTOUV(q0, q1, q2)
2049    MEMACCESS(2)
2050    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2051    MEMACCESS(3)
2052    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2053    "bgt        1b                             \n"
2054  : "+r"(src_rgba),  // %0
2055    "+r"(src_stride_rgba),  // %1
2056    "+r"(dst_u),     // %2
2057    "+r"(dst_v),     // %3
2058    "+r"(pix)        // %4
2059  :
2060  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2061    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2062  );
2063}
2064#endif  // HAS_RGBATOUVROW_NEON
2065
2066#ifdef HAS_RGB24TOUVROW_NEON
// Produces 8 U and 8 V bytes from a 16x2 block of RGB24 pixels per iteration.
// Three byte lanes are de-interleaved with vld3.8 into q0/q1/q2 (row 0) and
// q4/q5/q6 (row 1), used as B/G/R respectively.  2x2 sums are halved with
// rounding and passed to RGBTOUV(q0, q1, q2).
//   src_rgb24:        first row, advanced by 48 bytes per iteration.
//   src_stride_rgb24: byte offset from the first row to the second row; the
//                     asm replaces it in place with src_rgb24 + stride and
//                     then uses it as the second-row pointer.
//   dst_u/dst_v:      destinations, each advanced by 8 bytes per iteration.
//   pix:              pixels per row; decremented by 16, loop runs while > 0.
// NOTE(review): the body uses 32-bit ARM NEON syntax (vmov/vld3.8/vpadal, d/q
// register names) although this file is guarded by __aarch64__; presumably
// HAS_RGB24TOUVROW_NEON stays undefined until this is ported - confirm.
// NOTE(review): operand %1 is an int that ends up holding an address; that
// only fits where pointers are 32 bits wide - revisit when porting.
2067void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
2068                       uint8* dst_u, uint8* dst_v, int pix) {
2069  asm volatile (
2070    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
2071    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
2072    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
2073    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
2074    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
2075    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2076    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2077    ".p2align   2                              \n"
2078  "1:                                          \n"
2079    MEMACCESS(0)
2080    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
2081    MEMACCESS(0)
2082    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
2083    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
2084    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
2085    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
2086    MEMACCESS(1)
2087    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
2088    MEMACCESS(1)
2089    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
2090    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
2091    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
2092    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
2093
2094    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
2095    "vrshr.u16  q1, q1, #1                     \n"
2096    "vrshr.u16  q2, q2, #1                     \n"
2097
2098    "subs       %4, %4, #16                    \n"  // 16 pixels of width (16x2 block) per loop.
2099    RGBTOUV(q0, q1, q2)
2100    MEMACCESS(2)
2101    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2102    MEMACCESS(3)
2103    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2104    "bgt        1b                             \n"
2105  : "+r"(src_rgb24),  // %0
2106    "+r"(src_stride_rgb24),  // %1
2107    "+r"(dst_u),     // %2
2108    "+r"(dst_v),     // %3
2109    "+r"(pix)        // %4
2110  :
2111  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2112    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2113  );
2114}
2115#endif  // HAS_RGB24TOUVROW_NEON
2116
2117#ifdef HAS_RAWTOUVROW_NEON
// Converts two adjacent rows of RAW (3 bytes per pixel, with the channel
// order reversed relative to RGB24 as used in this file) into one row of U
// and one row of V. src_stride_raw is added to src_raw to form the second-row
// pointer (operand %1 is reused to hold it). Each loop iteration reads 16
// pixels from each row, sums every 2x2 group per channel, and stores 8 U
// bytes and 8 V bytes. pix is presumably a positive multiple of 16 - TODO
// confirm against callers.
// NOTE(review): src_stride_raw is an int that is turned into an address in
// place; confirm this is safe on targets where pointers are wider than int.
2118void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
2119                     uint8* dst_u, uint8* dst_v, int pix) {
2120  asm volatile (
2121    "add        %1, %0, %1                     \n"  // %1 = src_raw + src_stride (second row)
2122    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
2123    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
2124    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
2125    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
2126    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2127    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2128    ".p2align   2                              \n"
2129  "1:                                          \n"
2130    MEMACCESS(0)
2131    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
2132    MEMACCESS(0)
2133    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
2134    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
2135    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
2136    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
2137    MEMACCESS(1)
2138    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels (second row).
2139    MEMACCESS(1)
2140    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels (second row).
2141    "vpadal.u8  q2, q6                         \n"  // B: add second-row pair sums.
2142    "vpadal.u8  q1, q5                         \n"  // G: add second-row pair sums.
2143    "vpadal.u8  q0, q4                         \n"  // R: add second-row pair sums.
2144
2145    "vrshr.u16  q0, q0, #1                     \n"  // 2x average (4-pixel sum / 2; coefficients above are halved)
2146    "vrshr.u16  q1, q1, #1                     \n"
2147    "vrshr.u16  q2, q2, #1                     \n"
2148
2149    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
2150    RGBTOUV(q2, q1, q0)  // macro defined elsewhere; arguments are in B, G, R order here.
2151    MEMACCESS(2)
2152    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2153    MEMACCESS(3)
2154    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2155    "bgt        1b                             \n"  // flags come from the subs above.
2156  : "+r"(src_raw),  // %0
2157    "+r"(src_stride_raw),  // %1
2158    "+r"(dst_u),     // %2
2159    "+r"(dst_v),     // %3
2160    "+r"(pix)        // %4
2161  :
2162  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2163    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2164  );
2165}
2166#endif  // HAS_RAWTOUVROW_NEON
2167
2168// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
2169#ifdef HAS_RGB565TOUVROW_NEON
// Converts two adjacent rows of RGB565 into one row of U and one row of V.
// src_stride_rgb565 is added to src_rgb565 to form the second-row pointer
// (operand %1 is reused to hold it). Each loop iteration expands 16 pixels
// from each row with RGB565TOARGB (a macro defined elsewhere; the code after
// it reads B, G, R from d0, d1, d2), sums every 2x2 group per channel into
// q4 (B), q5 (G), q6 (R), and stores 8 U bytes and 8 V bytes.
// pix is presumably a positive multiple of 16 - TODO confirm against callers.
// NOTE(review): src_stride_rgb565 is an int that is turned into an address in
// place; confirm this is safe on targets where pointers are wider than int.
2170void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
2171                        uint8* dst_u, uint8* dst_v, int pix) {
2172  asm volatile (
2173    "add        %1, %0, %1                     \n"  // %1 = src_rgb565 + src_stride (second row)
2174    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
2175    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
2176    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
2177    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
2178    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2179    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2180    ".p2align   2                              \n"
2181  "1:                                          \n"
2182    MEMACCESS(0)
2183    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels (row 0).
2184    RGB565TOARGB
2185    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
2186    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
2187    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2188    MEMACCESS(0)
2189    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels (row 0).
2190    RGB565TOARGB
2191    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2192    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2193    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2194
2195    MEMACCESS(1)
2196    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels (row 1).
2197    RGB565TOARGB
2198    "vpadal.u8  d8, d0                         \n"  // B: add row 1 pair sums.
2199    "vpadal.u8  d10, d1                        \n"  // G: add row 1 pair sums.
2200    "vpadal.u8  d12, d2                        \n"  // R: add row 1 pair sums.
2201    MEMACCESS(1)
2202    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels (row 1).
2203    RGB565TOARGB
2204    "vpadal.u8  d9, d0                         \n"  // B: add row 1 pair sums.
2205    "vpadal.u8  d11, d1                        \n"  // G: add row 1 pair sums.
2206    "vpadal.u8  d13, d2                        \n"  // R: add row 1 pair sums.
2207
2208    "vrshr.u16  q4, q4, #1                     \n"  // 2x average (4-pixel sum / 2; coefficients above are halved)
2209    "vrshr.u16  q5, q5, #1                     \n"
2210    "vrshr.u16  q6, q6, #1                     \n"
2211
2212    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
2213    "vmul.s16   q8, q4, q10                    \n"  // B
2214    "vmls.s16   q8, q5, q11                    \n"  // G
2215    "vmls.s16   q8, q6, q12                    \n"  // R
2216    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
2217    "vmul.s16   q9, q6, q10                    \n"  // R
2218    "vmls.s16   q9, q5, q14                    \n"  // G
2219    "vmls.s16   q9, q4, q13                    \n"  // B
2220    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
2221    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
2222    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2223    MEMACCESS(2)
2224    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2225    MEMACCESS(3)
2226    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2227    "bgt        1b                             \n"  // flags come from the subs above.
2228  : "+r"(src_rgb565),  // %0
2229    "+r"(src_stride_rgb565),  // %1
2230    "+r"(dst_u),     // %2
2231    "+r"(dst_v),     // %3
2232    "+r"(pix)        // %4
2233  :
2234  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2235    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2236  );
2237}
2238#endif  // HAS_RGB565TOUVROW_NEON
2239
2240// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
2241#ifdef HAS_ARGB1555TOUVROW_NEON
// Converts two adjacent rows of ARGB1555 into one row of U and one row of V.
// src_stride_argb1555 is added to src_argb1555 to form the second-row pointer
// (operand %1 is reused to hold it). Each loop iteration expands 16 pixels
// from each row with RGB555TOARGB (a macro defined elsewhere; the code after
// it reads B, G, R from d0, d1, d2), sums every 2x2 group per channel into
// q4 (B), q5 (G), q6 (R), and stores 8 U bytes and 8 V bytes.
// pix is presumably a positive multiple of 16 - TODO confirm against callers.
// NOTE(review): src_stride_argb1555 is an int that is turned into an address
// in place; confirm this is safe on targets where pointers are wider than int.
2242void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
2243                        uint8* dst_u, uint8* dst_v, int pix) {
2244  asm volatile (
2245    "add        %1, %0, %1                     \n"  // %1 = src_argb1555 + src_stride (second row)
2246    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
2247    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
2248    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
2249    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
2250    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2251    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2252    ".p2align   2                              \n"
2253  "1:                                          \n"
2254    MEMACCESS(0)
2255    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels (row 0).
2256    RGB555TOARGB
2257    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
2258    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
2259    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2260    MEMACCESS(0)
2261    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels (row 0).
2262    RGB555TOARGB
2263    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2264    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2265    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2266
2267    MEMACCESS(1)
2268    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels (row 1).
2269    RGB555TOARGB
2270    "vpadal.u8  d8, d0                         \n"  // B: add row 1 pair sums.
2271    "vpadal.u8  d10, d1                        \n"  // G: add row 1 pair sums.
2272    "vpadal.u8  d12, d2                        \n"  // R: add row 1 pair sums.
2273    MEMACCESS(1)
2274    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels (row 1).
2275    RGB555TOARGB
2276    "vpadal.u8  d9, d0                         \n"  // B: add row 1 pair sums.
2277    "vpadal.u8  d11, d1                        \n"  // G: add row 1 pair sums.
2278    "vpadal.u8  d13, d2                        \n"  // R: add row 1 pair sums.
2279
2280    "vrshr.u16  q4, q4, #1                     \n"  // 2x average (4-pixel sum / 2; coefficients above are halved)
2281    "vrshr.u16  q5, q5, #1                     \n"
2282    "vrshr.u16  q6, q6, #1                     \n"
2283
2284    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
2285    "vmul.s16   q8, q4, q10                    \n"  // B
2286    "vmls.s16   q8, q5, q11                    \n"  // G
2287    "vmls.s16   q8, q6, q12                    \n"  // R
2288    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
2289    "vmul.s16   q9, q6, q10                    \n"  // R
2290    "vmls.s16   q9, q5, q14                    \n"  // G
2291    "vmls.s16   q9, q4, q13                    \n"  // B
2292    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
2293    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
2294    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2295    MEMACCESS(2)
2296    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2297    MEMACCESS(3)
2298    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2299    "bgt        1b                             \n"  // flags come from the subs above.
2300  : "+r"(src_argb1555),  // %0
2301    "+r"(src_stride_argb1555),  // %1
2302    "+r"(dst_u),     // %2
2303    "+r"(dst_v),     // %3
2304    "+r"(pix)        // %4
2305  :
2306  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2307    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2308  );
2309}
2310#endif  // HAS_ARGB1555TOUVROW_NEON
2311
2312// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
2313#ifdef HAS_ARGB4444TOUVROW_NEON
// Converts two adjacent rows of ARGB4444 into one row of U and one row of V.
// src_stride_argb4444 is added to src_argb4444 to form the second-row pointer
// (operand %1 is reused to hold it). Each loop iteration expands 16 pixels
// from each row with ARGB4444TOARGB (a macro defined elsewhere; the code
// after it reads B, G, R from d0, d1, d2), sums every 2x2 group per channel
// into q4 (B), q5 (G), q6 (R), and stores 8 U bytes and 8 V bytes.
// pix is presumably a positive multiple of 16 - TODO confirm against callers.
// NOTE(review): src_stride_argb4444 is an int that is turned into an address
// in place; confirm this is safe on targets where pointers are wider than int.
2314void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
2315                          uint8* dst_u, uint8* dst_v, int pix) {
2316  asm volatile (
2317    "add        %1, %0, %1                     \n"  // %1 = src_argb4444 + src_stride (second row)
2318    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
2319    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
2320    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
2321    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
2322    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2323    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2324    ".p2align   2                              \n"
2325  "1:                                          \n"
2326    MEMACCESS(0)
2327    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels (row 0).
2328    ARGB4444TOARGB
2329    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
2330    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
2331    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2332    MEMACCESS(0)
2333    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels (row 0).
2334    ARGB4444TOARGB
2335    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2336    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2337    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2338
2339    MEMACCESS(1)
2340    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels (row 1).
2341    ARGB4444TOARGB
2342    "vpadal.u8  d8, d0                         \n"  // B: add row 1 pair sums.
2343    "vpadal.u8  d10, d1                        \n"  // G: add row 1 pair sums.
2344    "vpadal.u8  d12, d2                        \n"  // R: add row 1 pair sums.
2345    MEMACCESS(1)
2346    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels (row 1).
2347    ARGB4444TOARGB
2348    "vpadal.u8  d9, d0                         \n"  // B: add row 1 pair sums.
2349    "vpadal.u8  d11, d1                        \n"  // G: add row 1 pair sums.
2350    "vpadal.u8  d13, d2                        \n"  // R: add row 1 pair sums.
2351
2352    "vrshr.u16  q4, q4, #1                     \n"  // 2x average (4-pixel sum / 2; coefficients above are halved)
2353    "vrshr.u16  q5, q5, #1                     \n"
2354    "vrshr.u16  q6, q6, #1                     \n"
2355
2356    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
2357    "vmul.s16   q8, q4, q10                    \n"  // B
2358    "vmls.s16   q8, q5, q11                    \n"  // G
2359    "vmls.s16   q8, q6, q12                    \n"  // R
2360    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
2361    "vmul.s16   q9, q6, q10                    \n"  // R
2362    "vmls.s16   q9, q5, q14                    \n"  // G
2363    "vmls.s16   q9, q4, q13                    \n"  // B
2364    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
2365    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
2366    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2367    MEMACCESS(2)
2368    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2369    MEMACCESS(3)
2370    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2371    "bgt        1b                             \n"  // flags come from the subs above.
2372  : "+r"(src_argb4444),  // %0
2373    "+r"(src_stride_argb4444),  // %1
2374    "+r"(dst_u),     // %2
2375    "+r"(dst_v),     // %3
2376    "+r"(pix)        // %4
2377  :
2378  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2379    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2380  );
2381}
2382#endif  // HAS_ARGB4444TOUVROW_NEON
2383
2384#ifdef HAS_RGB565TOYROW_NEON
// Converts a row of RGB565 pixels to Y bytes, 8 pixels per loop iteration.
// Per pixel the code computes 13 * B + 65 * G + 33 * R in 16 bits, narrows it
// with a rounding, saturating shift right by 7, then adds 16 with unsigned
// saturation. RGB565TOARGB is a macro defined elsewhere; the code after it
// reads B, G, R from d0, d1, d2.
// The loop repeats while pix is still above zero after subtracting 8, so pix
// is presumably a positive multiple of 8 - TODO confirm against callers.
2385void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
2386  asm volatile (
2387    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2388    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2389    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2390    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2391    ".p2align   2                              \n"
2392  "1:                                          \n"
2393    MEMACCESS(0)
2394    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
2395    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2396    RGB565TOARGB
2397    "vmull.u8   q2, d0, d24                    \n"  // B
2398    "vmlal.u8   q2, d1, d25                    \n"  // G
2399    "vmlal.u8   q2, d2, d26                    \n"  // R
2400    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2401    "vqadd.u8   d0, d27                        \n"  // Y += 16, saturating.
2402    MEMACCESS(1)
2403    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2404    "bgt        1b                             \n"  // flags come from the subs above.
2405  : "+r"(src_rgb565),  // %0
2406    "+r"(dst_y),       // %1
2407    "+r"(pix)          // %2
2408  :
2409  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"  // q12/q13 cover d24-d27.
2410  );
2411}
2412#endif  // HAS_RGB565TOYROW_NEON
2413
2414#ifdef HAS_ARGB1555TOYROW_NEON
// Converts a row of ARGB1555 pixels to Y bytes, 8 pixels per loop iteration.
// Per pixel the code computes 13 * B + 65 * G + 33 * R in 16 bits, narrows it
// with a rounding, saturating shift right by 7, then adds 16 with unsigned
// saturation. ARGB1555TOARGB is a macro defined elsewhere; the code after it
// reads B, G, R from d0, d1, d2.
// The loop repeats while pix is still above zero after subtracting 8, so pix
// is presumably a positive multiple of 8 - TODO confirm against callers.
2415void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
2416  asm volatile (
2417    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2418    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2419    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2420    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2421    ".p2align   2                              \n"
2422  "1:                                          \n"
2423    MEMACCESS(0)
2424    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
2425    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2426    ARGB1555TOARGB
2427    "vmull.u8   q2, d0, d24                    \n"  // B
2428    "vmlal.u8   q2, d1, d25                    \n"  // G
2429    "vmlal.u8   q2, d2, d26                    \n"  // R
2430    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2431    "vqadd.u8   d0, d27                        \n"  // Y += 16, saturating.
2432    MEMACCESS(1)
2433    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2434    "bgt        1b                             \n"  // flags come from the subs above.
2435  : "+r"(src_argb1555),  // %0
2436    "+r"(dst_y),         // %1
2437    "+r"(pix)            // %2
2438  :
2439  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"  // q12/q13 cover d24-d27.
2440  );
2441}
2442#endif  // HAS_ARGB1555TOYROW_NEON
2443
2444#ifdef HAS_ARGB4444TOYROW_NEON
// Converts a row of ARGB4444 pixels to Y bytes, 8 pixels per loop iteration.
// Per pixel the code computes 13 * B + 65 * G + 33 * R in 16 bits, narrows it
// with a rounding, saturating shift right by 7, then adds 16 with unsigned
// saturation. ARGB4444TOARGB is a macro defined elsewhere; the code after it
// reads B, G, R from d0, d1, d2.
// The loop repeats while pix is still above zero after subtracting 8, so pix
// is presumably a positive multiple of 8 - TODO confirm against callers.
2445void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
2446  asm volatile (
2447    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2448    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2449    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2450    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2451    ".p2align   2                              \n"
2452  "1:                                          \n"
2453    MEMACCESS(0)
2454    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
2455    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2456    ARGB4444TOARGB
2457    "vmull.u8   q2, d0, d24                    \n"  // B
2458    "vmlal.u8   q2, d1, d25                    \n"  // G
2459    "vmlal.u8   q2, d2, d26                    \n"  // R
2460    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2461    "vqadd.u8   d0, d27                        \n"  // Y += 16, saturating.
2462    MEMACCESS(1)
2463    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2464    "bgt        1b                             \n"  // flags come from the subs above.
2465  : "+r"(src_argb4444),  // %0
2466    "+r"(dst_y),         // %1
2467    "+r"(pix)            // %2
2468  :
2469  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"  // q12/q13 cover d24-d27.
2470  );
2471}
2472#endif  // HAS_ARGB4444TOYROW_NEON
2473
2474#ifdef HAS_BGRATOYROW_NEON
// Converts a row of BGRA pixels to Y bytes, 8 pixels per loop iteration.
// The 4-way de-interleaving load puts the four bytes of each pixel in
// d0..d3; d1, d2, d3 are used as R, G, B and d0 is not read as an input.
// Per pixel the code computes 33 * R + 65 * G + 13 * B in 16 bits, narrows it
// with a rounding, saturating shift right by 7, then adds 16 with unsigned
// saturation.
// The loop repeats while pix is still above zero after subtracting 8, so pix
// is presumably a positive multiple of 8 - TODO confirm against callers.
2475void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
2476  asm volatile (
2477    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2478    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2479    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2480    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2481    ".p2align   2                              \n"
2482  "1:                                          \n"
2483    MEMACCESS(0)
2484    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
2485    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2486    "vmull.u8   q8, d1, d4                     \n"  // R
2487    "vmlal.u8   q8, d2, d5                     \n"  // G
2488    "vmlal.u8   q8, d3, d6                     \n"  // B
2489    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2490    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating.
2491    MEMACCESS(1)
2492    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2493    "bgt        1b                             \n"  // flags come from the subs above.
2494  : "+r"(src_bgra),  // %0
2495    "+r"(dst_y),     // %1
2496    "+r"(pix)        // %2
2497  :
2498  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2499  );
2500}
2501#endif  // HAS_BGRATOYROW_NEON
2502
2503#ifdef HAS_ABGRTOYROW_NEON
// Converts a row of ABGR pixels to Y bytes, 8 pixels per loop iteration.
// The 4-way de-interleaving load puts the four bytes of each pixel in
// d0..d3; d0, d1, d2 are used as R, G, B and d3 is not read.
// Per pixel the code computes 33 * R + 65 * G + 13 * B in 16 bits, narrows it
// with a rounding, saturating shift right by 7, then adds 16 with unsigned
// saturation.
// The loop repeats while pix is still above zero after subtracting 8, so pix
// is presumably a positive multiple of 8 - TODO confirm against callers.
2504void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
2505  asm volatile (
2506    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2507    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2508    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2509    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2510    ".p2align   2                              \n"
2511  "1:                                          \n"
2512    MEMACCESS(0)
2513    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
2514    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2515    "vmull.u8   q8, d0, d4                     \n"  // R
2516    "vmlal.u8   q8, d1, d5                     \n"  // G
2517    "vmlal.u8   q8, d2, d6                     \n"  // B
2518    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2519    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating.
2520    MEMACCESS(1)
2521    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2522    "bgt        1b                             \n"  // flags come from the subs above.
2523  : "+r"(src_abgr),  // %0
2524    "+r"(dst_y),  // %1
2525    "+r"(pix)        // %2
2526  :
2527  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2528  );
2529}
2530#endif  // HAS_ABGRTOYROW_NEON
2531
2532#ifdef HAS_RGBATOYROW_NEON
// Converts a row of RGBA pixels to Y bytes, 8 pixels per loop iteration.
// The 4-way de-interleaving load puts the four bytes of each pixel in
// d0..d3; d1, d2, d3 are used as B, G, R and d0 is not read as an input.
// Per pixel the code computes 13 * B + 65 * G + 33 * R in 16 bits, narrows it
// with a rounding, saturating shift right by 7, then adds 16 with unsigned
// saturation.
// The loop repeats while pix is still above zero after subtracting 8, so pix
// is presumably a positive multiple of 8 - TODO confirm against callers.
2533void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
2534  asm volatile (
2535    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
2536    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2537    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
2538    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2539    ".p2align   2                              \n"
2540  "1:                                          \n"
2541    MEMACCESS(0)
2542    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
2543    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2544    "vmull.u8   q8, d1, d4                     \n"  // B
2545    "vmlal.u8   q8, d2, d5                     \n"  // G
2546    "vmlal.u8   q8, d3, d6                     \n"  // R
2547    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2548    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating.
2549    MEMACCESS(1)
2550    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2551    "bgt        1b                             \n"  // flags come from the subs above.
2552  : "+r"(src_rgba),  // %0
2553    "+r"(dst_y),  // %1
2554    "+r"(pix)        // %2
2555  :
2556  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2557  );
2558}
2559#endif  // HAS_RGBATOYROW_NEON
2560
2561#ifdef HAS_RGB24TOYROW_NEON
// Converts a row of RGB24 pixels to Y bytes, 8 pixels per loop iteration.
// The 3-way de-interleaving load puts the three bytes of each pixel in
// d0, d1, d2, which are used as B, G, R.
// Per pixel the code computes 13 * B + 65 * G + 33 * R in 16 bits, narrows it
// with a rounding, saturating shift right by 7, then adds 16 with unsigned
// saturation.
// The loop repeats while pix is still above zero after subtracting 8, so pix
// is presumably a positive multiple of 8 - TODO confirm against callers.
2562void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
2563  asm volatile (
2564    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
2565    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2566    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
2567    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2568    ".p2align   2                              \n"
2569  "1:                                          \n"
2570    MEMACCESS(0)
2571    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
2572    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2573    "vmull.u8   q8, d0, d4                     \n"  // B
2574    "vmlal.u8   q8, d1, d5                     \n"  // G
2575    "vmlal.u8   q8, d2, d6                     \n"  // R
2576    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2577    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating.
2578    MEMACCESS(1)
2579    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2580    "bgt        1b                             \n"  // flags come from the subs above.
2581  : "+r"(src_rgb24),  // %0
2582    "+r"(dst_y),  // %1
2583    "+r"(pix)        // %2
2584  :
2585  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2586  );
2587}
2588#endif  // HAS_RGB24TOYROW_NEON
2589
2590#ifdef HAS_RAWTOYROW_NEON
// Converts a row of RAW pixels (3 bytes per pixel) to Y bytes, 8 pixels per
// loop iteration. The 3-way de-interleaving load puts the three bytes of
// each pixel in d0, d1, d2; they are multiplied by the R, G and B
// coefficients respectively, i.e. the channel order is the reverse of the
// RGB24 variant.
// Per pixel the code computes 33 * R + 65 * G + 13 * B in 16 bits, narrows it
// with a rounding, saturating shift right by 7, then adds 16 with unsigned
// saturation.
// The loop repeats while pix is still above zero after subtracting 8, so pix
// is presumably a positive multiple of 8 - TODO confirm against callers.
2591void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
2592  asm volatile (
2593    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2594    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2595    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2596    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2597    ".p2align   2                              \n"
2598  "1:                                          \n"
2599    MEMACCESS(0)
2600    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
2601    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2602    "vmull.u8   q8, d0, d4                     \n"  // R
2603    "vmlal.u8   q8, d1, d5                     \n"  // G
2604    "vmlal.u8   q8, d2, d6                     \n"  // B
2605    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2606    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating.
2607    MEMACCESS(1)
2608    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2609    "bgt        1b                             \n"  // flags come from the subs above.
2610  : "+r"(src_raw),  // %0
2611    "+r"(dst_y),  // %1
2612    "+r"(pix)        // %2
2613  :
2614  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2615  );
2616}
2617#endif  // HAS_RAWTOYROW_NEON
2618
2619// Bilinear filter 16x2 -> 16x1
2620#ifdef HAS_INTERPOLATEROW_NEON
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr, 16 bytes per loop iteration.
// source_y_fraction selects the weights: the general path computes
//   dst = (row0 * (256 - fraction) + row1 * fraction + 128) >> 8
// per byte. Four values take dedicated paths:
//   0   -> copy row0 unchanged (the second row is never read),
//   64  -> two rounding halving adds giving roughly 75% row0 / 25% row1,
//   128 -> one rounding halving add (50 / 50),
//   192 -> two rounding halving adds giving roughly 25% row0 / 75% row1.
// Every path loops while dst_width is still above zero after subtracting 16,
// so dst_width is presumably a positive multiple of 16 - TODO confirm.
// NOTE(review): the general path broadcasts only the low 8 bits of the
// weights, so it presumably expects source_y_fraction in 1..255 - confirm.
2621void InterpolateRow_NEON(uint8* dst_ptr,
2622                         const uint8* src_ptr, ptrdiff_t src_stride,
2623                         int dst_width, int source_y_fraction) {
2624  asm volatile (
2625    "cmp        %4, #0                         \n"
2626    "beq        100f                           \n"  // fraction 0: plain copy.
2627    "add        %2, %1                         \n"  // %2 = src_ptr + src_stride (second row)
2628    "cmp        %4, #64                        \n"
2629    "beq        75f                            \n"  // fraction 64: 75 / 25 path.
2630    "cmp        %4, #128                       \n"
2631    "beq        50f                            \n"  // fraction 128: 50 / 50 path.
2632    "cmp        %4, #192                       \n"
2633    "beq        25f                            \n"  // fraction 192: 25 / 75 path.
2634
2635    "vdup.8     d5, %4                         \n"  // d5 = fraction (weight of second row)
2636    "rsb        %4, #256                       \n"  // %4 = 256 - fraction
2637    "vdup.8     d4, %4                         \n"  // d4 = 256 - fraction (weight of first row)
2638    // General purpose row blend.
2639  "1:                                          \n"
2640    MEMACCESS(1)
2641    "vld1.8     {q0}, [%1]!                    \n"  // 16 bytes of first row.
2642    MEMACCESS(2)
2643    "vld1.8     {q1}, [%2]!                    \n"  // 16 bytes of second row.
2644    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
2645    "vmull.u8   q13, d0, d4                    \n"  // first row * (256 - fraction)
2646    "vmull.u8   q14, d1, d4                    \n"
2647    "vmlal.u8   q13, d2, d5                    \n"  // + second row * fraction
2648    "vmlal.u8   q14, d3, d5                    \n"
2649    "vrshrn.u16 d0, q13, #8                    \n"  // rounding >> 8, narrow to bytes.
2650    "vrshrn.u16 d1, q14, #8                    \n"
2651    MEMACCESS(0)
2652    "vst1.8     {q0}, [%0]!                    \n"
2653    "bgt        1b                             \n"
2654    "b          99f                            \n"
2655
2656    // Blend 25 / 75 (first row / second row).
2657  "25:                                         \n"
2658    MEMACCESS(1)
2659    "vld1.8     {q0}, [%1]!                    \n"  // q0 = first row.
2660    MEMACCESS(2)
2661    "vld1.8     {q1}, [%2]!                    \n"  // q1 = second row.
2662    "subs       %3, %3, #16                    \n"
2663    "vrhadd.u8  q0, q1                         \n"  // average of both rows.
2664    "vrhadd.u8  q0, q1                         \n"  // average again with second row.
2665    MEMACCESS(0)
2666    "vst1.8     {q0}, [%0]!                    \n"
2667    "bgt        25b                            \n"
2668    "b          99f                            \n"
2669
2670    // Blend 50 / 50.
2671  "50:                                         \n"
2672    MEMACCESS(1)
2673    "vld1.8     {q0}, [%1]!                    \n"
2674    MEMACCESS(2)
2675    "vld1.8     {q1}, [%2]!                    \n"
2676    "subs       %3, %3, #16                    \n"
2677    "vrhadd.u8  q0, q1                         \n"  // rounding average of both rows.
2678    MEMACCESS(0)
2679    "vst1.8     {q0}, [%0]!                    \n"
2680    "bgt        50b                            \n"
2681    "b          99f                            \n"
2682
2683    // Blend 75 / 25 (first row / second row).
2684  "75:                                         \n"
2685    MEMACCESS(1)
2686    "vld1.8     {q1}, [%1]!                    \n"  // q1 = first row (registers swapped vs. 25 path).
2687    MEMACCESS(2)
2688    "vld1.8     {q0}, [%2]!                    \n"  // q0 = second row.
2689    "subs       %3, %3, #16                    \n"
2690    "vrhadd.u8  q0, q1                         \n"  // average of both rows.
2691    "vrhadd.u8  q0, q1                         \n"  // average again with first row.
2692    MEMACCESS(0)
2693    "vst1.8     {q0}, [%0]!                    \n"
2694    "bgt        75b                            \n"
2695    "b          99f                            \n"
2696
2697    // Blend 100 / 0 - Copy row unchanged.
2698  "100:                                        \n"
2699    MEMACCESS(1)
2700    "vld1.8     {q0}, [%1]!                    \n"
2701    "subs       %3, %3, #16                    \n"
2702    MEMACCESS(0)
2703    "vst1.8     {q0}, [%0]!                    \n"
2704    "bgt        100b                           \n"
2705
2706  "99:                                         \n"
2707  : "+r"(dst_ptr),          // %0
2708    "+r"(src_ptr),          // %1
2709    "+r"(src_stride),       // %2
2710    "+r"(dst_width),        // %3
2711    "+r"(source_y_fraction) // %4
2712  :
2713  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
2714  );
2715}
2716#endif  // HAS_INTERPOLATEROW_NEON
2717
2718// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2719#ifdef HAS_ARGBBLENDROW_NEON
// Blends a row of src_argb0 over src_argb1 and writes ARGB to dst_argb.
// For each of B, G and R the template computes
//   dst = sat(src0 + sat(src1 - sat((src1 * a0 + 128) >> 8)))
// where a0 is the alpha byte of the src_argb0 pixel; the stored alpha is
// always 255.  Groups of 8 pixels go through the "8:" loop, any remaining
// pixels go through the one-pixel "1:" loop.
// NOTE(review): the template uses vld4.8 / q-register style mnemonics while
// this file is only compiled for __aarch64__ - presumably
// HAS_ARGBBLENDROW_NEON stays undefined until this body is ported; confirm.
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    // Pre-decrement so the sign of the counter says whether a full group of
    // 8 pixels is available.
    "subs       %3, #8                         \n"
    "blt        89f                            \n"
    // Blend 8 pixels.
  "8:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
    MEMACCESS(1)
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q10, d4, d3                    \n"  // db * a
    "vmull.u8   q11, d5, d3                    \n"  // dg * a
    "vmull.u8   q12, d6, d3                    \n"  // dr * a
    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
    "vqadd.u8   d2, d2, d6                     \n"  // + sr
    "vmov.u8    d3, #255                       \n"  // a = 255
    MEMACCESS(2)
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
    "bge        8b                             \n"

    // Counter is now (remaining - 8).  Adding 7 leaves (remaining - 1), which
    // is negative exactly when no pixels are left.
  "89:                                         \n"
    "adds       %3, #8-1                       \n"
    "blt        99f                            \n"

    // Blend 1 pixel at a time.  Only lane 0 is loaded and stored; the
    // arithmetic is the same as in the 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
    MEMACCESS(1)
    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
    "vmull.u8   q10, d4, d3                    \n"  // db * a
    "vmull.u8   q11, d5, d3                    \n"  // dg * a
    "vmull.u8   q12, d6, d3                    \n"  // dr * a
    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
    "vqadd.u8   d2, d2, d6                     \n"  // + sr
    "vmov.u8    d3, #255                       \n"  // a = 255
    MEMACCESS(2)
    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
    "bge        1b                             \n"

  "99:                                         \n"

  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
  );
}
2783#endif  // HAS_ARGBBLENDROW_NEON
2784
2785// Attenuate 8 pixels at a time.
2786#ifdef HAS_ARGBATTENUATEROW_NEON
// Multiplies the B, G and R bytes of each pixel by that pixel's alpha:
//   c = sat((c * a + 128) >> 8)
// The alpha byte (d3) is stored back unmodified.  The loop body runs before
// the counter is tested and consumes 8 pixels per iteration.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 8; otherwise pixels past `width` are read and written - confirm.
// NOTE(review): the template uses vld4.8 / q-register style mnemonics while
// this file is only compiled for __aarch64__ - presumably
// HAS_ARGBATTENUATEROW_NEON stays undefined until this body is ported.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q10, d0, d3                    \n"  // b * a
    "vmull.u8   q11, d1, d3                    \n"  // g * a
    "vmull.u8   q12, d2, d3                    \n"  // r * a
    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
    MEMACCESS(1)
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
  );
}
2810#endif  // HAS_ARGBATTENUATEROW_NEON
2811
2812// Quantize 8 ARGB pixels (32 bytes).
2813// dst = (dst * scale >> 16) * interval_size + interval_offset;
2814#ifdef HAS_ARGBQUANTIZEROW_NEON
// Quantizes the B, G and R bytes of a row in place, 8 pixels per iteration:
//   c = sat_u8(((c * scale) >> 16) * interval_size + interval_offset)
// scale is halved up front because vqdmulh doubles the product before taking
// the high 16 bits.  Only the low 16 bits of scale, interval_size and
// interval_offset are used (vdup.u16).  The alpha byte (d6) is loaded and
// stored back unchanged.
// The load has no writeback and the store post-increments %0, so both operate
// on the same 32 bytes.
// NOTE(review): the template uses vld4.8 / q-register style mnemonics while
// this file is only compiled for __aarch64__ - presumably
// HAS_ARGBQUANTIZEROW_NEON stays undefined until this body is ported.
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "vdup.u16   q8, %2                         \n"
    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
    "vdup.u16   q9, %3                         \n"  // interval multiply.
    "vdup.u16   q10, %4                        \n"  // interval add

    // 8 pixel loop.
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q2, d4                         \n"
    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
    "vqdmulh.s16 q1, q1, q8                    \n"  // g
    "vqdmulh.s16 q2, q2, q8                    \n"  // r
    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
    "vmul.u16   q1, q1, q9                     \n"  // g
    "vmul.u16   q2, q2, q9                     \n"  // r
    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
    "vadd.u16   q1, q1, q10                    \n"  // g
    "vadd.u16   q2, q2, q10                    \n"  // r
    "vqmovn.u16 d0, q0                         \n"
    "vqmovn.u16 d2, q1                         \n"
    "vqmovn.u16 d4, q2                         \n"
    MEMACCESS(0)
    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  );
}
2855#endif  // HAS_ARGBQUANTIZEROW_NEON
2856
2857// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
2859// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2860#ifdef HAS_ARGBSHADEROW_NEON
// Scales each channel of a row of ARGB pixels by the matching byte of
// `value`, 8 pixels per iteration.
// Setup: the 32-bit value is duplicated, vzip.u8 pairs every byte with itself
// so each 16-bit lane of d0 holds one channel's scale byte repeated twice,
// and the lanes are then halved because vqrdmulh doubles the product.
// Lane d0[0] scales the first byte of every pixel, d0[1] the second, d0[2]
// the third and d0[3] the fourth.
// NOTE(review): the template uses vld4.8 / q-register style mnemonics while
// this file is only compiled for __aarch64__ - presumably
// HAS_ARGBSHADEROW_NEON stays undefined until this body is ported.
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.

    // 8 pixel loop.
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
    "vmovl.u8   q11, d22                       \n"
    "vmovl.u8   q12, d24                       \n"
    "vmovl.u8   q13, d26                       \n"
    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
    "vqmovn.u16 d20, q10                       \n"
    "vqmovn.u16 d22, q11                       \n"
    "vqmovn.u16 d24, q12                       \n"
    "vqmovn.u16 d26, q13                       \n"
    MEMACCESS(1)
    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "r"(value)            // %3
  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
  );
}
2896#endif  // HAS_ARGBSHADEROW_NEON
2897
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2899// Similar to ARGBToYJ but stores ARGB.
2900// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2901#ifdef HAS_ARGBGRAYROW_NEON
// Replaces B, G and R of each pixel with one gray value
//   (15 * b + 75 * g + 38 * r + 64) >> 7
// and stores the alpha byte (d3) unchanged.  8 pixels are processed per
// iteration and the loop body runs before the counter is tested.
// The three weights add up to 128, so the weighted sum is at most 255 * 128
// and fits the signed 16-bit input of vqrshrun.
// NOTE(review): the template uses vld4.8 / q-register style mnemonics while
// this file is only compiled for __aarch64__ - presumably
// HAS_ARGBGRAYROW_NEON stays undefined until this body is ported.
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
    "vmov       d1, d0                         \n"  // G
    "vmov       d2, d0                         \n"  // R
    MEMACCESS(1)
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}
2928#endif  // HAS_ARGBGRAYROW_NEON
2929
2930// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2931//    b = (r * 35 + g * 68 + b * 17) >> 7
2932//    g = (r * 45 + g * 88 + b * 22) >> 7
2933//    r = (r * 50 + g * 98 + b * 24) >> 7
2934
2935#ifdef HAS_ARGBSEPIAROW_NEON
2936void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2937  asm volatile (
2938    "vmov.u8    d20, #17                       \n"  // BB coefficient
2939    "vmov.u8    d21, #68                       \n"  // BG coefficient
2940    "vmov.u8    d22, #35                       \n"  // BR coefficient
2941    "vmov.u8    d24, #22                       \n"  // GB coefficient
2942    "vmov.u8    d25, #88                       \n"  // GG coefficient
2943    "vmov.u8    d26, #45                       \n"  // GR coefficient
2944    "vmov.u8    d28, #24                       \n"  // BB coefficient
2945    "vmov.u8    d29, #98                       \n"  // BG coefficient
2946    "vmov.u8    d30, #50                       \n"  // BR coefficient
2947    ".p2align   2                              \n"
2948  "1:                                          \n"
2949    MEMACCESS(0)
2950    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
2951    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
2952    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
2953    "vmlal.u8   q2, d1, d21                    \n"  // G
2954    "vmlal.u8   q2, d2, d22                    \n"  // R
2955    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
2956    "vmlal.u8   q3, d1, d25                    \n"  // G
2957    "vmlal.u8   q3, d2, d26                    \n"  // R
2958    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
2959    "vmlal.u8   q8, d1, d29                    \n"  // G
2960    "vmlal.u8   q8, d2, d30                    \n"  // R
2961    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
2962    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
2963    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
2964    MEMACCESS(0)
2965    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
2966    "bgt        1b                             \n"
2967  : "+r"(dst_argb),  // %0
2968    "+r"(width)      // %1
2969  :
2970  : "cc", "memory", "q0", "q1", "q2", "q3",
2971    "q10", "q11", "q12", "q13", "q14", "q15"
2972  );
2973}
2974#endif  // HAS_ARGBSEPIAROW_NEON
2975
// Transform 8 ARGB pixels (32 bytes) with color matrix.
2977// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
2978// needs to saturate.  Consider doing a non-saturating version.
2979#ifdef HAS_ARGBCOLORMATRIXROW_NEON
2980void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2981                             const int8* matrix_argb, int width) {
2982  asm volatile (
2983    MEMACCESS(3)
2984    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
2985    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
2986    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
2987
2988    ".p2align   2                              \n"
2989  "1:                                          \n"
2990    MEMACCESS(0)
2991    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
2992    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2993    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
2994    "vmovl.u8   q9, d18                        \n"  // g
2995    "vmovl.u8   q10, d20                       \n"  // r
2996    "vmovl.u8   q15, d22                       \n"  // a
2997    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
2998    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
2999    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
3000    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
3001    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
3002    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
3003    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
3004    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
3005    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
3006    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
3007    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
3008    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
3009    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
3010    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
3011    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
3012    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
3013    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
3014    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
3015    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
3016    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
3017    "vmul.s16   q4, q15, d0[3]                 \n"  // B += A * Matrix B
3018    "vmul.s16   q5, q15, d1[3]                 \n"  // G += A * Matrix G
3019    "vmul.s16   q6, q15, d2[3]                 \n"  // R += A * Matrix R
3020    "vmul.s16   q7, q15, d3[3]                 \n"  // A += A * Matrix A
3021    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
3022    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
3023    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
3024    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
3025    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
3026    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
3027    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
3028    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
3029    MEMACCESS(1)
3030    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
3031    "bgt        1b                             \n"
3032  : "+r"(src_argb),   // %0
3033    "+r"(dst_argb),   // %1
3034    "+r"(width)       // %2
3035  : "r"(matrix_argb)  // %3
3036  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
3037    "q10", "q11", "q12", "q13", "q14", "q15"
3038  );
3039}
3040#endif  // HAS_ARGBCOLORMATRIXROW_NEON
3041
3042// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
3043// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
3044#ifdef HAS_ARGBMULTIPLYROW_NEON
3045void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3046                          uint8* dst_argb, int width) {
3047  asm volatile (
3048    // 8 pixel loop.
3049    ".p2align   2                              \n"
3050  "1:                                          \n"
3051    MEMACCESS(0)
3052    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
3053    MEMACCESS(1)
3054    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
3055    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
3056    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
3057    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
3058    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
3059    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
3060    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
3061    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
3062    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
3063    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
3064    MEMACCESS(2)
3065    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
3066    "bgt        1b                             \n"
3067
3068  : "+r"(src_argb0),  // %0
3069    "+r"(src_argb1),  // %1
3070    "+r"(dst_argb),   // %2
3071    "+r"(width)       // %3
3072  :
3073  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3074  );
3075}
3076#endif  // HAS_ARGBMULTIPLYROW_NEON
3077
3078// Add 2 rows of ARGB pixels together, 8 pixels at a time.
3079#ifdef HAS_ARGBADDROW_NEON
3080void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3081                     uint8* dst_argb, int width) {
3082  asm volatile (
3083    // 8 pixel loop.
3084    ".p2align   2                              \n"
3085  "1:                                          \n"
3086    MEMACCESS(0)
3087    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
3088    MEMACCESS(1)
3089    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
3090    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
3091    "uqadd      v0.8b, v0.8b, v4.8b            \n"
3092    "uqadd      v1.8b, v1.8b, v5.8b            \n"
3093    "uqadd      v2.8b, v2.8b, v6.8b            \n"
3094    "uqadd      v3.8b, v3.8b, v7.8b            \n"
3095    MEMACCESS(2)
3096    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
3097    "bgt        1b                             \n"
3098
3099  : "+r"(src_argb0),  // %0
3100    "+r"(src_argb1),  // %1
3101    "+r"(dst_argb),   // %2
3102    "+r"(width)       // %3
3103  :
3104  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3105  );
3106}
3107#endif  // HAS_ARGBADDROW_NEON
3108
3109// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
3110#ifdef HAS_ARGBSUBTRACTROW_NEON
3111void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3112                          uint8* dst_argb, int width) {
3113  asm volatile (
3114    // 8 pixel loop.
3115    ".p2align   2                              \n"
3116  "1:                                          \n"
3117    MEMACCESS(0)
3118    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
3119    MEMACCESS(1)
3120    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
3121    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
3122    "uqsub      v0.8b, v0.8b, v4.8b            \n"
3123    "uqsub      v1.8b, v1.8b, v5.8b            \n"
3124    "uqsub      v2.8b, v2.8b, v6.8b            \n"
3125    "uqsub      v3.8b, v3.8b, v7.8b            \n"
3126    MEMACCESS(2)
3127    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
3128    "bgt        1b                             \n"
3129
3130  : "+r"(src_argb0),  // %0
3131    "+r"(src_argb1),  // %1
3132    "+r"(dst_argb),   // %2
3133    "+r"(width)       // %3
3134  :
3135  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3136  );
3137}
3138#endif  // HAS_ARGBSUBTRACTROW_NEON
3139
3140// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
3141// A = 255
3142// R = Sobel
3143// G = Sobel
3144// B = Sobel
3145#ifdef HAS_SOBELROW_NEON
3146void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3147                     uint8* dst_argb, int width) {
3148  asm volatile (
3149    "movi       v3.8b, #255                    \n"  // alpha
3150    // 8 pixel loop.
3151    ".p2align   2                              \n"
3152  "1:                                          \n"
3153    MEMACCESS(0)
3154    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
3155    MEMACCESS(1)
3156    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
3157    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
3158    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
3159    "mov        v1.8b, v0.8b                   \n"
3160    "mov        v2.8b, v0.8b                   \n"
3161    MEMACCESS(2)
3162    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
3163    "bgt        1b                             \n"
3164  : "+r"(src_sobelx),  // %0
3165    "+r"(src_sobely),  // %1
3166    "+r"(dst_argb),    // %2
3167    "+r"(width)        // %3
3168  :
3169  : "cc", "memory", "v0", "v1", "v2", "v3"
3170  );
3171}
3172#endif  // HAS_SOBELROW_NEON
3173
3174// Adds Sobel X and Sobel Y and stores Sobel into plane.
3175#ifdef HAS_SOBELTOPLANEROW_NEON
3176void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3177                          uint8* dst_y, int width) {
3178  asm volatile (
3179    // 16 pixel loop.
3180    ".p2align   2                              \n"
3181  "1:                                          \n"
3182    MEMACCESS(0)
3183    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
3184    MEMACCESS(1)
3185    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
3186    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
3187    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
3188    MEMACCESS(2)
3189    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
3190    "bgt        1b                             \n"
3191  : "+r"(src_sobelx),  // %0
3192    "+r"(src_sobely),  // %1
3193    "+r"(dst_y),       // %2
3194    "+r"(width)        // %3
3195  :
3196  : "cc", "memory", "v0", "v1"
3197  );
3198}
3199#endif  // HAS_SOBELTOPLANEROW_NEON
3200
3201// Mixes Sobel X, Sobel Y and Sobel into ARGB.
3202// A = 255
3203// R = Sobel X
3204// G = Sobel
3205// B = Sobel Y
3206#ifdef HAS_SOBELXYROW_NEON
3207void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3208                     uint8* dst_argb, int width) {
3209  asm volatile (
3210    "movi       v3.8b, #255                    \n"  // alpha
3211    // 8 pixel loop.
3212    ".p2align   2                              \n"
3213  "1:                                          \n"
3214    MEMACCESS(0)
3215    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
3216    MEMACCESS(1)
3217    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
3218    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
3219    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
3220    MEMACCESS(2)
3221    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
3222    "bgt        1b                             \n"
3223  : "+r"(src_sobelx),  // %0
3224    "+r"(src_sobely),  // %1
3225    "+r"(dst_argb),    // %2
3226    "+r"(width)        // %3
3227  :
3228  : "cc", "memory", "v0", "v1", "v2", "v3"
3229  );
3230}
3231#endif  // HAS_SOBELXYROW_NEON
3232
3233// SobelX as a matrix is
3234// -1  0  1
3235// -2  0  2
3236// -1  0  1
3237#ifdef HAS_SOBELXROW_NEON
3238void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
3239                    const uint8* src_y2, uint8* dst_sobelx, int width) {
3240  asm volatile (
3241    ".p2align   2                              \n"
3242  "1:                                          \n"
3243    MEMACCESS(0)
3244    "ld1        {v0.8b}, [%0],%5               \n"  // top
3245    MEMACCESS(0)
3246    "ld1        {v1.8b}, [%0],%6               \n"
3247    "usubl      v0.8h, v0.8b, v1.8b            \n"
3248    MEMACCESS(1)
3249    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
3250    MEMACCESS(1)
3251    "ld1        {v3.8b}, [%1],%6               \n"
3252    "usubl      v1.8h, v2.8b, v3.8b            \n"
3253    "add        v0.8h, v0.8h, v1.8h            \n"
3254    "add        v0.8h, v0.8h, v1.8h            \n"
3255    MEMACCESS(2)
3256    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
3257    MEMACCESS(2)
3258    "ld1        {v3.8b}, [%2],%6               \n"
3259    "subs       %4, %4, #8                     \n"  // 8 pixels
3260    "usubl      v1.8h, v2.8b, v3.8b            \n"
3261    "add        v0.8h, v0.8h, v1.8h            \n"
3262    "abs        v0.8h, v0.8h                   \n"
3263    "uqxtn      v0.8b, v0.8h                   \n"
3264    MEMACCESS(3)
3265    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
3266    "bgt        1b                             \n"
3267  : "+r"(src_y0),      // %0
3268    "+r"(src_y1),      // %1
3269    "+r"(src_y2),      // %2
3270    "+r"(dst_sobelx),  // %3
3271    "+r"(width)        // %4
3272  : "r"(2),            // %5
3273    "r"(6)             // %6
3274  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
3275  );
3276}
3277#endif  // HAS_SOBELXROW_NEON
3278
3279// SobelY as a matrix is
3280// -1 -2 -1
3281//  0  0  0
3282//  1  2  1
3283#ifdef HAS_SOBELYROW_NEON
3284void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
3285                    uint8* dst_sobely, int width) {
3286  asm volatile (
3287    ".p2align   2                              \n"
3288  "1:                                          \n"
3289    MEMACCESS(0)
3290    "ld1        {v0.8b}, [%0],%4               \n"  // left
3291    MEMACCESS(1)
3292    "ld1        {v1.8b}, [%1],%4               \n"
3293    "usubl      v0.8h, v0.8b, v1.8b            \n"
3294    MEMACCESS(0)
3295    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
3296    MEMACCESS(1)
3297    "ld1        {v3.8b}, [%1],%4               \n"
3298    "usubl      v1.8h, v2.8b, v3.8b            \n"
3299    "add        v0.8h, v0.8h, v1.8h            \n"
3300    "add        v0.8h, v0.8h, v1.8h            \n"
3301    MEMACCESS(0)
3302    "ld1        {v2.8b}, [%0],%5               \n"  // right
3303    MEMACCESS(1)
3304    "ld1        {v3.8b}, [%1],%5               \n"
3305    "subs       %3, %3, #8                     \n"  // 8 pixels
3306    "usubl      v1.8h, v2.8b, v3.8b            \n"
3307    "add        v0.8h, v0.8h, v1.8h            \n"
3308    "abs        v0.8h, v0.8h                   \n"
3309    "uqxtn      v0.8b, v0.8h                   \n"
3310    MEMACCESS(2)
3311    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
3312    "bgt        1b                             \n"
3313  : "+r"(src_y0),      // %0
3314    "+r"(src_y1),      // %1
3315    "+r"(dst_sobely),  // %2
3316    "+r"(width)        // %3
3317  : "r"(1),            // %4
3318    "r"(6)             // %5
3319  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
3320  );
3321}
3322#endif  // HAS_SOBELYROW_NEON
3323#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3324
3325#ifdef __cplusplus
3326}  // extern "C"
3327}  // namespace libyuv
3328#endif
3329