/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include "libyuv/row.h"
12
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
18// This module is for GCC Neon armv8 64 bit.
19#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
// Read 8 Y, 4 U and 4 V from 422.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer. Each pointer is
// post-incremented past the bytes it loaded (8, 4 and 4 respectively).
// Result: v0.8b = 8 Y, v1.s[0] = 4 U, v1.s[1] = 4 V.
#define READYUV422                                                  \
  MEMACCESS(0)                                                      \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y          */ \
  MEMACCESS(1)                                                      \
  "ld1        {v1.s}[0], [%1], #4            \n" /* 4 U, low     */ \
  MEMACCESS(2)                                                      \
  "ld1        {v1.s}[1], [%2], #4            \n" /* 4 V, high    */
29
// Read 8 Y, 8 U and 8 V from 444.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each is
// post-incremented by 8.
// The 8 U and 8 V bytes are then reduced to 4 U and 4 V by summing adjacent
// pairs (uaddlp) and halving with rounding (rshrn #1), so the result has the
// same register layout as READYUV422: v0.8b = 8 Y, v1.8b = 4 U followed by 4 V.
#define READYUV444                                                  \
  MEMACCESS(0)                                                      \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y          */ \
  MEMACCESS(1)                                                      \
  "ld1        {v1.d}[0], [%1], #8            \n" /* 8 U, low     */ \
  MEMACCESS(2)                                                      \
  "ld1        {v1.d}[1], [%2], #8            \n" /* 8 V, high    */ \
  "uaddlp     v1.8h, v1.16b                  \n" /* pair sums    */ \
  "rshrn      v1.8b, v1.8h, #1               \n" /* rounded avg  */
40
// Read 8 Y, and set 4 U and 4 V to 128.
// Operand: %0 = Y pointer, post-incremented by 8.
// Result: v0.8b = 8 Y, v1.8b = eight bytes of 128 (the 4 U and 4 V slots).
#define READYUV400                                                  \
  MEMACCESS(0)                                                      \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y          */ \
  "movi       v1.8b , #128                   \n" /* U = V = 128  */
46
// Read 8 Y and 4 UV from NV12.
// Operands: %0 = Y pointer, %1 = interleaved UV pointer; each is
// post-incremented by 8.
// The 8 interleaved chroma bytes are split into even bytes (U) and odd bytes
// (V) and repacked so that v1.8b = 4 U followed by 4 V. v0.8b = 8 Y.
// Scratch registers: v2, v3.
#define READNV12                                                    \
  MEMACCESS(0)                                                      \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y          */ \
  MEMACCESS(1)                                                      \
  "ld1        {v2.8b}, [%1], #8              \n" /* 4 UV pairs   */ \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even -> U    */ \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd  -> V    */ \
  "ins        v1.s[1], v3.s[0]               \n" /* v1 = U | V   */
56
// Read 8 Y and 4 VU from NV21.
// Operands: %0 = Y pointer, %1 = interleaved VU pointer; each is
// post-incremented by 8.
// Same as READNV12 except the chroma order is swapped: the even bytes (V) go
// to the scratch register and the odd bytes (U) go to v1, so the result is
// again v1.8b = 4 U followed by 4 V. v0.8b = 8 Y.
// Scratch registers: v2, v3.
#define READNV21                                                    \
  MEMACCESS(0)                                                      \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y          */ \
  MEMACCESS(1)                                                      \
  "ld1        {v2.8b}, [%1], #8              \n" /* 4 VU pairs   */ \
  "uzp1       v3.8b, v2.8b, v2.8b            \n" /* even -> V    */ \
  "uzp2       v1.8b, v2.8b, v2.8b            \n" /* odd  -> U    */ \
  "ins        v1.s[1], v3.s[0]               \n" /* v1 = U | V   */
66
// Read 8 YUY2 pixels (16 bytes).
// Operand: %0 = YUY2 pointer, post-incremented by 16.
// ld2 de-interleaves the row into even bytes (Y, in v0) and odd bytes
// (alternating U and V, in v1). The chroma bytes are then split and repacked
// so that v1.8b = 4 U followed by 4 V. Scratch register: v3.
#define READYUY2                                                    \
  MEMACCESS(0)                                                      \
  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" /* Y / UV       */ \
  "uzp2       v3.8b, v1.8b, v1.8b            \n" /* odd  -> V    */ \
  "uzp1       v1.8b, v1.8b, v1.8b            \n" /* even -> U    */ \
  "ins        v1.s[1], v3.s[0]               \n" /* v1 = U | V   */
74
// Read 8 UYVY pixels (16 bytes).
// Operand: %0 = UYVY pointer, post-incremented by 16.
// ld2 de-interleaves the row into even bytes (alternating U and V, in v2) and
// odd bytes (Y, in v3). Y is copied to v0 and the chroma bytes are split and
// repacked so that v1.8b = 4 U followed by 4 V. Scratch registers: v2, v3.
#define READUYVY                                                    \
  MEMACCESS(0)                                                      \
  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" /* UV / Y       */ \
  "orr        v0.8b, v3.8b, v3.8b            \n" /* v0 = Y       */ \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even -> U    */ \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd  -> V    */ \
  "ins        v1.s[1], v3.s[0]               \n" /* v1 = U | V   */
83
// Load the YUV-to-RGB conversion constants once, before the pixel loop.
// Requires the named asm operands kUVBiasBGR, kYToRgb, kUVToRB and kUVToG.
//   v24, v25, v26 <= the three consecutive 16-bit values at kUVBiasBGR, each
//                    broadcast to all 8 lanes (used by YUVTORGB as the B, G
//                    and R bias respectively),
//   v31           <= the 32-bit value at kYToRgb broadcast to all 4 lanes,
//   v27, v28      <= kUVToRB de-interleaved into even / odd 16-bit elements,
//   v29, v30      <= kUVToG de-interleaved the same way.
// A single ld3r (no write-back) is used for the bias values so that the
// register holding the input-only kUVBiasBGR operand is never modified.
// Clobbers v24-v31.
#define YUVTORGB_SETUP                                      \
  "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
  "ld1r       {v31.4s}, [%[kYToRgb]]         \n"            \
  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"            \
  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
91
// Convert 8 pixels from YUV to 8-bit R, G and B.
// Inputs:  v0.8b = 8 Y, v1.8b = 4 U followed by 4 V (as produced by the
//          READ* macros), and the constants loaded by YUVTORGB_SETUP
//          (v24-v31).
// Outputs: vR.8b, vG.8b, vB.8b = 8 red, green and blue bytes; the register
//          names are supplied by the caller and stringified into the asm.
// Steps:   Y is widened to 32 bits, multiplied by v31 and narrowed back with
//          a 16-bit right shift. Each chroma byte is duplicated so that two
//          adjacent pixels share one U/V sample, then widened to 16 bits and
//          multiplied by the coefficients in v27-v30. Each channel is
//          bias + Y, plus (B, R) or minus (G) its chroma term, and is finally
//          narrowed to unsigned 8 bits with a saturating right shift by 6.
// Scratch: v0, v1, v2, v3, v5, v6, v7.
#define YUVTORGB(vR, vG, vB)                                                \
  "uxtl       v0.8h, v0.8b                   \n" /* Widen Y to 16 bit   */ \
  "shll       v2.8h, v1.8b, #8               \n" /* UV << 8             */ \
  "ushll2     v3.4s, v0.8h, #0               \n" /* Y high 4 to 32 bit  */ \
  "ushll      v0.4s, v0.4h, #0               \n" /* Y low 4 to 32 bit   */ \
  "mul        v3.4s, v3.4s, v31.4s           \n" /* Y * kYToRgb         */ \
  "mul        v0.4s, v0.4s, v31.4s           \n"                           \
  "sqshrun    v0.4h, v0.4s, #16              \n" /* Narrow Y, >> 16     */ \
  "sqshrun2   v0.8h, v3.4s, #16              \n"                           \
  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Duplicate UV bytes  */ \
  "mov        v2.d[0], v1.d[1]               \n" /* Extract V           */ \
  "uxtl       v2.8h, v2.8b                   \n" /* Widen V to 16 bit   */ \
  "uxtl       v1.8h, v1.8b                   \n" /* Widen U to 16 bit   */ \
  "mul        v3.8h, v1.8h, v27.8h           \n" /* U term for B        */ \
  "mul        v5.8h, v1.8h, v29.8h           \n" /* U term for G        */ \
  "mul        v6.8h, v2.8h, v30.8h           \n" /* V term for G        */ \
  "mul        v7.8h, v2.8h, v28.8h           \n" /* V term for R        */ \
  "sqadd      v6.8h, v6.8h, v5.8h            \n" /* Combined G term     */ \
  "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B = bias + Y        */ \
  "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G = bias + Y        */ \
  "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R = bias + Y        */ \
  "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B += U term         */ \
  "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G -= UV term        */ \
  "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R += V term         */ \
  "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B to 8 bit          */ \
  "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G to 8 bit          */ \
  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R to 8 bit          */
127
128void I444ToARGBRow_NEON(const uint8* src_y,
129                        const uint8* src_u,
130                        const uint8* src_v,
131                        uint8* dst_argb,
132                        const struct YuvConstants* yuvconstants,
133                        int width) {
134  asm volatile (
135    YUVTORGB_SETUP
136    "movi       v23.8b, #255                   \n" /* A */
137  "1:                                          \n"
138    READYUV444
139    YUVTORGB(v22, v21, v20)
140    "subs       %w4, %w4, #8                   \n"
141    MEMACCESS(3)
142    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
143    "b.gt       1b                             \n"
144    : "+r"(src_y),     // %0
145      "+r"(src_u),     // %1
146      "+r"(src_v),     // %2
147      "+r"(dst_argb),  // %3
148      "+r"(width)      // %4
149    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
150      [kUVToG]"r"(&yuvconstants->kUVToG),
151      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
152      [kYToRgb]"r"(&yuvconstants->kYToRgb)
153    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
154      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
155  );
156}
157
158void I422ToARGBRow_NEON(const uint8* src_y,
159                        const uint8* src_u,
160                        const uint8* src_v,
161                        uint8* dst_argb,
162                        const struct YuvConstants* yuvconstants,
163                        int width) {
164  asm volatile (
165    YUVTORGB_SETUP
166    "movi       v23.8b, #255                   \n" /* A */
167  "1:                                          \n"
168    READYUV422
169    YUVTORGB(v22, v21, v20)
170    "subs       %w4, %w4, #8                   \n"
171    MEMACCESS(3)
172    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
173    "b.gt       1b                             \n"
174    : "+r"(src_y),     // %0
175      "+r"(src_u),     // %1
176      "+r"(src_v),     // %2
177      "+r"(dst_argb),  // %3
178      "+r"(width)      // %4
179    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
180      [kUVToG]"r"(&yuvconstants->kUVToG),
181      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
182      [kYToRgb]"r"(&yuvconstants->kYToRgb)
183    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
184      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
185  );
186}
187
188void I422AlphaToARGBRow_NEON(const uint8* src_y,
189                             const uint8* src_u,
190                             const uint8* src_v,
191                             const uint8* src_a,
192                             uint8* dst_argb,
193                             const struct YuvConstants* yuvconstants,
194                             int width) {
195  asm volatile (
196    YUVTORGB_SETUP
197  "1:                                          \n"
198    READYUV422
199    YUVTORGB(v22, v21, v20)
200    MEMACCESS(3)
201    "ld1        {v23.8b}, [%3], #8             \n"
202    "subs       %w5, %w5, #8                   \n"
203    MEMACCESS(4)
204    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
205    "b.gt       1b                             \n"
206    : "+r"(src_y),     // %0
207      "+r"(src_u),     // %1
208      "+r"(src_v),     // %2
209      "+r"(src_a),     // %3
210      "+r"(dst_argb),  // %4
211      "+r"(width)      // %5
212    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
213      [kUVToG]"r"(&yuvconstants->kUVToG),
214      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
215      [kYToRgb]"r"(&yuvconstants->kYToRgb)
216    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
217      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
218  );
219}
220
221void I422ToRGBARow_NEON(const uint8* src_y,
222                        const uint8* src_u,
223                        const uint8* src_v,
224                        uint8* dst_rgba,
225                        const struct YuvConstants* yuvconstants,
226                        int width) {
227  asm volatile (
228    YUVTORGB_SETUP
229    "movi       v20.8b, #255                   \n" /* A */
230  "1:                                          \n"
231    READYUV422
232    YUVTORGB(v23, v22, v21)
233    "subs       %w4, %w4, #8                   \n"
234    MEMACCESS(3)
235    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
236    "b.gt       1b                             \n"
237    : "+r"(src_y),     // %0
238      "+r"(src_u),     // %1
239      "+r"(src_v),     // %2
240      "+r"(dst_rgba),  // %3
241      "+r"(width)      // %4
242    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
243      [kUVToG]"r"(&yuvconstants->kUVToG),
244      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
245      [kYToRgb]"r"(&yuvconstants->kYToRgb)
246    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
247      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
248  );
249}
250
251void I422ToRGB24Row_NEON(const uint8* src_y,
252                         const uint8* src_u,
253                         const uint8* src_v,
254                         uint8* dst_rgb24,
255                         const struct YuvConstants* yuvconstants,
256                         int width) {
257  asm volatile (
258    YUVTORGB_SETUP
259  "1:                                          \n"
260    READYUV422
261    YUVTORGB(v22, v21, v20)
262    "subs       %w4, %w4, #8                   \n"
263    MEMACCESS(3)
264    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
265    "b.gt       1b                             \n"
266    : "+r"(src_y),     // %0
267      "+r"(src_u),     // %1
268      "+r"(src_v),     // %2
269      "+r"(dst_rgb24), // %3
270      "+r"(width)      // %4
271    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
272      [kUVToG]"r"(&yuvconstants->kUVToG),
273      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
274      [kYToRgb]"r"(&yuvconstants->kYToRgb)
275    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
276      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
277  );
278}
279
// Pack 8 pixels into 16-bit RGB565.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R.
// Output:  v0.8h, each lane holding the top 5 bits of R in bits 15..11, the
//          top 6 bits of G in bits 10..5 and the top 5 bits of B in bits 4..0.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8);
// sri then shifts G and B right and inserts them below the bits already
// placed. v20 and v21 are overwritten with their widened values.
#define ARGBTORGB565                                                        \
  "shll       v0.8h,  v22.8b, #8             \n" /* R                    */ \
  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
  "sri        v0.8h,  v21.8h, #5             \n" /* RG                   */ \
  "sri        v0.8h,  v20.8h, #11            \n" /* RGB                  */
286
287void I422ToRGB565Row_NEON(const uint8* src_y,
288                          const uint8* src_u,
289                          const uint8* src_v,
290                          uint8* dst_rgb565,
291                          const struct YuvConstants* yuvconstants,
292                          int width) {
293  asm volatile (
294    YUVTORGB_SETUP
295  "1:                                          \n"
296    READYUV422
297    YUVTORGB(v22, v21, v20)
298    "subs       %w4, %w4, #8                   \n"
299    ARGBTORGB565
300    MEMACCESS(3)
301    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
302    "b.gt       1b                             \n"
303    : "+r"(src_y),    // %0
304      "+r"(src_u),    // %1
305      "+r"(src_v),    // %2
306      "+r"(dst_rgb565),  // %3
307      "+r"(width)     // %4
308    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
309      [kUVToG]"r"(&yuvconstants->kUVToG),
310      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
311      [kYToRgb]"r"(&yuvconstants->kYToRgb)
312    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
313      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
314  );
315}
316
// Pack 8 pixels into 16-bit ARGB1555.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
// Output:  v0.8h, each lane holding the top bit of A in bit 15 and the top 5
//          bits of R, G and B in bits 14..10, 9..5 and 4..0 respectively.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8);
// sri then shifts R, G and B right and inserts them below the bits already
// placed. v20, v21 and v22 are overwritten with their widened values; v23 is
// left unchanged.
#define ARGBTOARGB1555                                                      \
  "shll       v0.8h,  v23.8b, #8             \n" /* A                    */ \
  "shll       v22.8h, v22.8b, #8             \n" /* R                    */ \
  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
  "sri        v0.8h,  v22.8h, #1             \n" /* AR                   */ \
  "sri        v0.8h,  v21.8h, #6             \n" /* ARG                  */ \
  "sri        v0.8h,  v20.8h, #11            \n" /* ARGB                 */
325
326void I422ToARGB1555Row_NEON(const uint8* src_y,
327                            const uint8* src_u,
328                            const uint8* src_v,
329                            uint8* dst_argb1555,
330                            const struct YuvConstants* yuvconstants,
331                            int width) {
332  asm volatile (
333    YUVTORGB_SETUP
334    "movi       v23.8b, #255                   \n"
335  "1:                                          \n"
336    READYUV422
337    YUVTORGB(v22, v21, v20)
338    "subs       %w4, %w4, #8                   \n"
339    ARGBTOARGB1555
340    MEMACCESS(3)
341    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
342    "b.gt       1b                             \n"
343    : "+r"(src_y),    // %0
344      "+r"(src_u),    // %1
345      "+r"(src_v),    // %2
346      "+r"(dst_argb1555),  // %3
347      "+r"(width)     // %4
348    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
349      [kUVToG]"r"(&yuvconstants->kUVToG),
350      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
351      [kYToRgb]"r"(&yuvconstants->kYToRgb)
352    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
353      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
354  );
355}
356
// Pack 8 pixels into 16-bit ARGB4444.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A, v4.8b = 0x0f in
//          every byte.
// Output:  v0.16b, two bytes per pixel: first (G high nibble | B >> 4), then
//          (A high nibble | R >> 4).
// B and R are shifted down into the low nibble; G and A have their low nibble
// cleared with bic. The two combined bytes are then interleaved with zip1.
// v1 and v20-v23 are overwritten.
#define ARGBTOARGB4444                                                       \
  "ushr       v20.8b, v20.8b, #4             \n" /* B                    */  \
  "bic        v21.8b, v21.8b, v4.8b          \n" /* G                    */  \
  "ushr       v22.8b, v22.8b, #4             \n" /* R                    */  \
  "bic        v23.8b, v23.8b, v4.8b          \n" /* A                    */  \
  "orr        v0.8b,  v20.8b, v21.8b         \n" /* BG                   */  \
  "orr        v1.8b,  v22.8b, v23.8b         \n" /* RA                   */  \
  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA                 */
366
367void I422ToARGB4444Row_NEON(const uint8* src_y,
368                            const uint8* src_u,
369                            const uint8* src_v,
370                            uint8* dst_argb4444,
371                            const struct YuvConstants* yuvconstants,
372                            int width) {
373  asm volatile (
374    YUVTORGB_SETUP
375    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
376  "1:                                          \n"
377    READYUV422
378    YUVTORGB(v22, v21, v20)
379    "subs       %w4, %w4, #8                   \n"
380    "movi       v23.8b, #255                   \n"
381    ARGBTOARGB4444
382    MEMACCESS(3)
383    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
384    "b.gt       1b                             \n"
385    : "+r"(src_y),    // %0
386      "+r"(src_u),    // %1
387      "+r"(src_v),    // %2
388      "+r"(dst_argb4444),  // %3
389      "+r"(width)     // %4
390    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
391      [kUVToG]"r"(&yuvconstants->kUVToG),
392      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
393      [kYToRgb]"r"(&yuvconstants->kYToRgb)
394    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
395      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
396  );
397}
398
399void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
400  asm volatile (
401    YUVTORGB_SETUP
402    "movi       v23.8b, #255                   \n"
403  "1:                                          \n"
404    READYUV400
405    YUVTORGB(v22, v21, v20)
406    "subs       %w2, %w2, #8                   \n"
407    MEMACCESS(1)
408    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
409    "b.gt       1b                             \n"
410    : "+r"(src_y),     // %0
411      "+r"(dst_argb),  // %1
412      "+r"(width)      // %2
413    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
414      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
415      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
416      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
417    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
418      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
419  );
420}
421
422void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
423  asm volatile (
424    "movi       v23.8b, #255                   \n"
425  "1:                                          \n"
426    MEMACCESS(0)
427    "ld1        {v20.8b}, [%0], #8             \n"
428    "orr        v21.8b, v20.8b, v20.8b         \n"
429    "orr        v22.8b, v20.8b, v20.8b         \n"
430    "subs       %w2, %w2, #8                   \n"
431    MEMACCESS(1)
432    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
433    "b.gt       1b                             \n"
434    : "+r"(src_y),     // %0
435      "+r"(dst_argb),  // %1
436      "+r"(width)      // %2
437    :
438    : "cc", "memory", "v20", "v21", "v22", "v23"
439  );
440}
441
442void NV12ToARGBRow_NEON(const uint8* src_y,
443                        const uint8* src_uv,
444                        uint8* dst_argb,
445                        const struct YuvConstants* yuvconstants,
446                        int width) {
447  asm volatile (
448    YUVTORGB_SETUP
449    "movi       v23.8b, #255                   \n"
450  "1:                                          \n"
451    READNV12
452    YUVTORGB(v22, v21, v20)
453    "subs       %w3, %w3, #8                   \n"
454    MEMACCESS(2)
455    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
456    "b.gt       1b                             \n"
457    : "+r"(src_y),     // %0
458      "+r"(src_uv),    // %1
459      "+r"(dst_argb),  // %2
460      "+r"(width)      // %3
461    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
462      [kUVToG]"r"(&yuvconstants->kUVToG),
463      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
464      [kYToRgb]"r"(&yuvconstants->kYToRgb)
465    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
466      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
467  );
468}
469
470void NV21ToARGBRow_NEON(const uint8* src_y,
471                        const uint8* src_vu,
472                        uint8* dst_argb,
473                        const struct YuvConstants* yuvconstants,
474                        int width) {
475  asm volatile (
476    YUVTORGB_SETUP
477    "movi       v23.8b, #255                   \n"
478  "1:                                          \n"
479    READNV21
480    YUVTORGB(v22, v21, v20)
481    "subs       %w3, %w3, #8                   \n"
482    MEMACCESS(2)
483    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
484    "b.gt       1b                             \n"
485    : "+r"(src_y),     // %0
486      "+r"(src_vu),    // %1
487      "+r"(dst_argb),  // %2
488      "+r"(width)      // %3
489    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
490      [kUVToG]"r"(&yuvconstants->kUVToG),
491      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
492      [kYToRgb]"r"(&yuvconstants->kYToRgb)
493    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
494      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
495  );
496}
497
498void NV12ToRGB565Row_NEON(const uint8* src_y,
499                          const uint8* src_uv,
500                          uint8* dst_rgb565,
501                          const struct YuvConstants* yuvconstants,
502                          int width) {
503  asm volatile (
504    YUVTORGB_SETUP
505  "1:                                          \n"
506    READNV12
507    YUVTORGB(v22, v21, v20)
508    "subs       %w3, %w3, #8                   \n"
509    ARGBTORGB565
510    MEMACCESS(2)
511    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
512    "b.gt       1b                             \n"
513    : "+r"(src_y),     // %0
514      "+r"(src_uv),    // %1
515      "+r"(dst_rgb565),  // %2
516      "+r"(width)      // %3
517    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
518      [kUVToG]"r"(&yuvconstants->kUVToG),
519      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
520      [kYToRgb]"r"(&yuvconstants->kYToRgb)
521    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
522      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
523  );
524}
525
526void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
527                        uint8* dst_argb,
528                        const struct YuvConstants* yuvconstants,
529                        int width) {
530  asm volatile (
531    YUVTORGB_SETUP
532    "movi       v23.8b, #255                   \n"
533  "1:                                          \n"
534    READYUY2
535    YUVTORGB(v22, v21, v20)
536    "subs       %w2, %w2, #8                   \n"
537    MEMACCESS(1)
538    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
539    "b.gt       1b                             \n"
540    : "+r"(src_yuy2),  // %0
541      "+r"(dst_argb),  // %1
542      "+r"(width)      // %2
543    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
544      [kUVToG]"r"(&yuvconstants->kUVToG),
545      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
546      [kYToRgb]"r"(&yuvconstants->kYToRgb)
547    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
548      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
549  );
550}
551
552void UYVYToARGBRow_NEON(const uint8* src_uyvy,
553                        uint8* dst_argb,
554                        const struct YuvConstants* yuvconstants,
555                        int width) {
556  asm volatile (
557    YUVTORGB_SETUP
558    "movi       v23.8b, #255                   \n"
559  "1:                                          \n"
560    READUYVY
561    YUVTORGB(v22, v21, v20)
562    "subs       %w2, %w2, #8                   \n"
563    MEMACCESS(1)
564    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
565    "b.gt       1b                             \n"
566    : "+r"(src_uyvy),  // %0
567      "+r"(dst_argb),  // %1
568      "+r"(width)      // %2
569    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
570      [kUVToG]"r"(&yuvconstants->kUVToG),
571      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
572      [kYToRgb]"r"(&yuvconstants->kYToRgb)
573    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
574      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
575  );
576}
577
578// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
579void SplitUVRow_NEON(const uint8* src_uv,
580                     uint8* dst_u,
581                     uint8* dst_v,
582                     int width) {
583  asm volatile (
584  "1:                                          \n"
585    MEMACCESS(0)
586    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
587    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
588    MEMACCESS(1)
589    "st1        {v0.16b}, [%1], #16            \n"  // store U
590    MEMACCESS(2)
591    "st1        {v1.16b}, [%2], #16            \n"  // store V
592    "b.gt       1b                             \n"
593    : "+r"(src_uv),  // %0
594      "+r"(dst_u),   // %1
595      "+r"(dst_v),   // %2
596      "+r"(width)    // %3  // Output registers
597    :                       // Input registers
598    : "cc", "memory", "v0", "v1"  // Clobber List
599  );
600}
601
602// Reads 16 U's and V's and writes out 16 pairs of UV.
603void MergeUVRow_NEON(const uint8* src_u,
604                     const uint8* src_v,
605                     uint8* dst_uv,
606                     int width) {
607  asm volatile (
608  "1:                                          \n"
609    MEMACCESS(0)
610    "ld1        {v0.16b}, [%0], #16            \n"  // load U
611    MEMACCESS(1)
612    "ld1        {v1.16b}, [%1], #16            \n"  // load V
613    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
614    MEMACCESS(2)
615    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
616    "b.gt       1b                             \n"
617    :
618      "+r"(src_u),   // %0
619      "+r"(src_v),   // %1
620      "+r"(dst_uv),  // %2
621      "+r"(width)    // %3  // Output registers
622    :                       // Input registers
623    : "cc", "memory", "v0", "v1"  // Clobber List
624  );
625}
626
627// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
628void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
629  asm volatile (
630  "1:                                          \n"
631    MEMACCESS(0)
632    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
633    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
634    MEMACCESS(1)
635    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
636    "b.gt       1b                             \n"
637  : "+r"(src),   // %0
638    "+r"(dst),   // %1
639    "+r"(count)  // %2  // Output registers
640  :                     // Input registers
641  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
642  );
643}
644
645// SetRow writes 'count' bytes using an 8 bit value repeated.
646void SetRow_NEON(uint8* dst, uint8 v8, int count) {
647  asm volatile (
648    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
649  "1:                                          \n"
650    "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
651    MEMACCESS(0)
652    "st1        {v0.16b}, [%0], #16            \n"  // store
653    "b.gt       1b                             \n"
654  : "+r"(dst),   // %0
655    "+r"(count)  // %1
656  : "r"(v8)      // %2
657  : "cc", "memory", "v0"
658  );
659}
660
661void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
662  asm volatile (
663    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
664  "1:                                          \n"
665    "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
666    MEMACCESS(0)
667    "st1        {v0.16b}, [%0], #16            \n"  // store
668    "b.gt       1b                             \n"
669  : "+r"(dst),   // %0
670    "+r"(count)  // %1
671  : "r"(v32)     // %2
672  : "cc", "memory", "v0"
673  );
674}
675
676void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
677  asm volatile (
678    // Start at end of source row.
679    "add        %0, %0, %w2, sxtw              \n"
680    "sub        %0, %0, #16                    \n"
681  "1:                                          \n"
682    MEMACCESS(0)
683    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
684    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
685    "rev64      v0.16b, v0.16b                 \n"
686    MEMACCESS(1)
687    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
688    MEMACCESS(1)
689    "st1        {v0.D}[0], [%1], #8            \n"
690    "b.gt       1b                             \n"
691  : "+r"(src),   // %0
692    "+r"(dst),   // %1
693    "+r"(width)  // %2
694  : "r"((ptrdiff_t)-16)    // %3
695  : "cc", "memory", "v0"
696  );
697}
698
699void MirrorUVRow_NEON(const uint8* src_uv,
700                      uint8* dst_u,
701                      uint8* dst_v,
702                      int width) {
703  asm volatile (
704    // Start at end of source row.
705    "add        %0, %0, %w3, sxtw #1           \n"
706    "sub        %0, %0, #16                    \n"
707  "1:                                          \n"
708    MEMACCESS(0)
709    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
710    "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
711    "rev64      v0.8b, v0.8b                   \n"
712    "rev64      v1.8b, v1.8b                   \n"
713    MEMACCESS(1)
714    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
715    MEMACCESS(2)
716    "st1        {v1.8b}, [%2], #8              \n"
717    "b.gt       1b                             \n"
718  : "+r"(src_uv),  // %0
719    "+r"(dst_u),   // %1
720    "+r"(dst_v),   // %2
721    "+r"(width)    // %3
722  : "r"((ptrdiff_t)-16)      // %4
723  : "cc", "memory", "v0", "v1"
724  );
725}
726
// Mirrors a row of 4-byte pixels. Reads src backwards starting from its last
// 16 bytes (src + width * 4 - 16) and writes dst forwards, 4 pixels per
// iteration. Bytes inside each 4-byte pixel keep their order; only the pixel
// order is reversed. The loop test is at the bottom, so one full group of 4
// pixels is always processed and iteration stops once width is <= 0.
727void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
728  asm volatile (
729  // Start at end of source row.
730    "add        %0, %0, %w2, sxtw #2           \n"
731    "sub        %0, %0, #16                    \n"
732  "1:                                          \n"
733    MEMACCESS(0)
734    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
735    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
    // Swap the two 32-bit pixels inside each 64-bit half...
736    "rev64      v0.4s, v0.4s                   \n"
    // ...then store the high half before the low half to finish the reversal.
737    MEMACCESS(1)
738    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
739    MEMACCESS(1)
740    "st1        {v0.D}[0], [%1], #8            \n"
741    "b.gt       1b                             \n"
742  : "+r"(src),   // %0
743    "+r"(dst),   // %1
744    "+r"(width)  // %2
745  : "r"((ptrdiff_t)-16)    // %3  post-index step: walk the source backwards
746  : "cc", "memory", "v0"
747  );
748}
749
// Expands 3-byte pixels to 4-byte pixels by appending a constant 255 byte.
// Each iteration reads 24 bytes (8 pixels) from src_rgb24 and writes 32 bytes
// to dst_argb; the three source bytes of every pixel keep their order and the
// fourth output byte (alpha) is 255. The loop always runs at least once and
// stops when width, decremented by 8 per iteration, is <= 0.
750void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
751  asm volatile (
752    "movi       v4.8b, #255                    \n"  // Alpha
753  "1:                                          \n"
754    MEMACCESS(0)
755    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
756    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
757    MEMACCESS(1)
758    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
759    "b.gt       1b                             \n"
760  : "+r"(src_rgb24),  // %0
761    "+r"(dst_argb),   // %1
762    "+r"(width)       // %2
763  :
764  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
765  );
766}
767
// Converts 3-byte RAW pixels (byte order r, g, b per the load comment) to
// 4-byte pixels stored as b, g, r, a with a = 255. In other words the first
// and third source bytes are swapped and an opaque alpha byte is appended.
// 8 pixels (24 bytes in, 32 bytes out) are handled per iteration; the loop
// always runs at least once and stops when the remaining width is <= 0.
768void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
769  asm volatile (
770    "movi       v5.8b, #255                    \n"  // Alpha
771  "1:                                          \n"
772    MEMACCESS(0)
773    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
774    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    // st4 needs consecutive registers, so copy g and r behind b (v2).
775    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
776    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
777    MEMACCESS(1)
778    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
779    "b.gt       1b                             \n"
780  : "+r"(src_raw),   // %0
781    "+r"(dst_argb),  // %1
782    "+r"(width)      // %2
783  :
784  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
785  );
786}
787
// Swaps the first and third byte of every 3-byte pixel (r g b -> b g r per
// the inline comments). 8 pixels (24 bytes) are read from src_raw and written
// to dst_rgb24 per iteration; the loop always runs at least once and stops
// when the remaining width is <= 0.
788void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
789  asm volatile (
790  "1:                                          \n"
791    MEMACCESS(0)
792    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
793    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    // st3 needs consecutive registers, so copy g and r behind b (v2).
794    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
795    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
796    MEMACCESS(1)
797    "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
798    "b.gt       1b                             \n"
799  : "+r"(src_raw),    // %0
800    "+r"(dst_rgb24),  // %1
801    "+r"(width)       // %2
802  :
803  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
804  );
805}
806
// Asm fragment: expands 8 RGB565 pixels to 8-bit B, G and R channels.
// Input:  v0 = eight 16-bit pixels (R in bits 15..11, G in 10..5, B in 4..0).
// Output: v0 (low 8 bytes) = B, v1 (low 8 bytes) = G, v2 = R.
// Scratch: v4, v6. Each 5- or 6-bit field is shifted to the top of its byte
// and its own high bits are OR-ed into the vacated low bits.
807#define RGB565TOARGB                                                        \
808  "shrn       v6.8b, v0.8h, #5               \n" /* G xxGGGGGG           */ \
809  "shl        v6.8b, v6.8b, #2               \n" /* G GGGGGG00 upper 6   */ \
810  "ushr       v4.8b, v6.8b, #6               \n" /* G 000000GG lower 2   */ \
811  "orr        v1.8b, v4.8b, v6.8b            \n" /* G                    */ \
812  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
813  "ushr       v0.8h, v0.8h, #11              \n" /* R 000RRRRR           */ \
814  "xtn2       v2.16b,v0.8h                   \n" /* R in upper part      */ \
815  "shl        v2.16b, v2.16b, #3             \n" /* R,B BBBBB000 upper 5 */ \
816  "ushr       v0.16b, v2.16b, #5             \n" /* R,B 00000BBB lower 3 */ \
817  "orr        v0.16b, v0.16b, v2.16b         \n" /* R,B                  */ \
818  "dup        v2.2D, v0.D[1]                 \n" /* R                    */
819
// Converts RGB565 pixels to 4-byte B, G, R, A pixels with A = 255.
// Each iteration reads 16 bytes (8 pixels) from src_rgb565, expands them with
// RGB565TOARGB (B in v0, G in v1, R in v2) and stores 32 bytes to dst_argb.
// The loop always runs at least once and stops when width, decremented by 8
// per iteration, is <= 0.
820void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
821  asm volatile (
822    "movi       v3.8b, #255                    \n"  // Alpha
823  "1:                                          \n"
824    MEMACCESS(0)
825    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
826    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
827    RGB565TOARGB
828    MEMACCESS(1)
829    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
830    "b.gt       1b                             \n"
831  : "+r"(src_rgb565),  // %0
832    "+r"(dst_argb),    // %1
833    "+r"(width)          // %2
834  :
835  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
836  );
837}
838
// Asm fragment: expands 8 ARGB1555 pixels to 8-bit B, G, R and A channels.
// Input:  v0 = eight 16-bit pixels (A in bit 15, R in 14..10, G in 9..5,
//         B in 4..0).
// Output: v0 (low 8 bytes) = B, v1 = G, v2 (low 8 bytes) = R, v3 = A.
// The 1-bit alpha becomes 0x00 or 0xFF via the arithmetic shift by 15; each
// 5-bit colour field is shifted to the top of its byte and its own high 3
// bits are OR-ed into the low bits. v3 is fully overwritten by this fragment.
839#define ARGB1555TOARGB                                                      \
840  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
841  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
842  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000 AAAAAAAA    */ \
843                                                                            \
844  "sshr       v2.8h, v0.8h, #15              \n" /* A AAAAAAAA           */ \
845  "xtn2       v3.16b, v2.8h                  \n"                            \
846                                                                            \
847  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
848  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
849                                                                            \
850  "ushr       v1.16b, v3.16b, #5             \n" /* R,A 00000RRR lower 3 */ \
851  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
852  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
853                                                                            \
854  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
855  "orr        v2.16b, v1.16b, v3.16b         \n" /* R,A                  */ \
856  "dup        v1.2D, v0.D[1]                 \n"                            \
857  "dup        v3.2D, v2.D[1]                 \n"
858
859// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Input:  v0 = eight 16-bit pixels (R in bits 14..10, G in 9..5, B in 4..0).
// Output: v0 (low 8 bytes) = B, v1 = G, v2 (low 8 bytes) = R.
// v3 is used as scratch and does not hold an alpha value afterwards.
860#define RGB555TOARGB                                                        \
861  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
862  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
863  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000             */ \
864                                                                            \
865  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
866  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
867                                                                            \
868  "ushr       v1.16b, v3.16b, #5             \n" /* R   00000RRR lower 3 */ \
869  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
870  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
871                                                                            \
872  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
873  "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
874  "dup        v1.2D, v0.D[1]                 \n" /* G */
875
// Converts ARGB1555 pixels to 4-byte B, G, R, A pixels.
// Each iteration reads 16 bytes (8 pixels) from src_argb1555, expands them
// with ARGB1555TOARGB and stores 32 bytes to dst_argb. The loop always runs
// at least once and stops when width, decremented by 8 per iteration, is <= 0.
// NOTE(review): ARGB1555TOARGB rewrites v3 with the expanded source alpha on
// every iteration, so the 255 loaded before the loop is never what gets
// stored; the initial movi looks redundant.
876void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
877                            uint8* dst_argb,
878                            int width) {
879  asm volatile (
880    "movi       v3.8b, #255                    \n"  // Alpha
881  "1:                                          \n"
882    MEMACCESS(0)
883    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
884    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
885    ARGB1555TOARGB
886    MEMACCESS(1)
887    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
888    "b.gt       1b                             \n"
889  : "+r"(src_argb1555),  // %0
890    "+r"(dst_argb),    // %1
891    "+r"(width)          // %2
892  :
893  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
894  );
895}
896
// Asm fragment: expands 8 ARGB4444 pixels to 8-bit B, G, R and A channels.
// Input:  v0 = eight 16-bit pixels (A in bits 15..12, R in 11..8, G in 7..4,
//         B in 3..0).
// Output: v0 = B, v1 = G, v2 (low 8 bytes) = R, v3 (low 8 bytes) = A.
// Every 4-bit field is replicated into both nibbles of its output byte.
897#define ARGB4444TOARGB                                                      \
898  "shrn       v1.8b,  v0.8h, #8              \n" /* v1(l) AR             */ \
899  "xtn2       v1.16b, v0.8h                  \n" /* v1(h) GB             */ \
900  "shl        v2.16b, v1.16b, #4             \n" /* B,R BBBB0000         */ \
901  "ushr       v3.16b, v1.16b, #4             \n" /* G,A 0000GGGG         */ \
902  "ushr       v0.16b, v2.16b, #4             \n" /* B,R 0000BBBB         */ \
903  "shl        v1.16b, v3.16b, #4             \n" /* G,A GGGG0000         */ \
904  "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R BBBBBBBB         */ \
905  "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A GGGGGGGG         */ \
906  "dup        v0.2D, v2.D[1]                 \n"                            \
907  "dup        v1.2D, v3.D[1]                 \n"
908
// Converts ARGB4444 pixels to 4-byte B, G, R, A pixels.
// Each iteration reads 16 bytes (8 pixels) from src_argb4444, expands them
// with ARGB4444TOARGB and stores 32 bytes to dst_argb. The loop always runs
// at least once and stops when width, decremented by 8 per iteration, is <= 0.
// NOTE(review): v4 is listed as clobbered although neither this loop nor
// ARGB4444TOARGB writes it; harmless, but it could be dropped.
909void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
910                            uint8* dst_argb,
911                            int width) {
912  asm volatile (
913  "1:                                          \n"
914    MEMACCESS(0)
915    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
916    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
917    ARGB4444TOARGB
918    MEMACCESS(1)
919    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
920    "b.gt       1b                             \n"
921  : "+r"(src_argb4444),  // %0
922    "+r"(dst_argb),    // %1
923    "+r"(width)          // %2
924  :
925  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
926  );
927}
928
// Packs 4-byte pixels into 3-byte pixels by dropping the fourth byte (alpha).
// Each iteration reads 32 bytes (8 pixels) from src_argb and writes 24 bytes
// to dst_rgb24, keeping the first three bytes of every pixel in order. The
// loop always runs at least once and stops when the remaining width is <= 0.
929void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
930  asm volatile (
931  "1:                                          \n"
932    MEMACCESS(0)
933    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
934    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
935    MEMACCESS(1)
936    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
937    "b.gt       1b                             \n"
938  : "+r"(src_argb),   // %0
939    "+r"(dst_rgb24),  // %1
940    "+r"(width)         // %2
941  :
942  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
943  );
944}
945
// Converts 4-byte b, g, r, a pixels to 3-byte r, g, b pixels: alpha is
// dropped and the first and third bytes are swapped. Each iteration reads 32
// bytes (8 pixels) from src_argb and writes 24 bytes to dst_raw. The loop
// always runs at least once and stops when the remaining width is <= 0.
946void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
947  asm volatile (
948  "1:                                          \n"
949    MEMACCESS(0)
950    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
951    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    // st3 needs consecutive registers: put g and b behind r (v3). The alpha
    // in v4 is overwritten here because it is not needed.
952    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
953    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
954    MEMACCESS(1)
955    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
956    "b.gt       1b                             \n"
957  : "+r"(src_argb),  // %0
958    "+r"(dst_raw),   // %1
959    "+r"(width)        // %2
960  :
961  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
962  );
963}
964
// Extracts the Y plane from a YUY2 row by keeping every even-indexed byte.
// Each iteration reads 32 bytes from src_yuy2, de-interleaves them into even
// bytes (v0) and odd bytes (v1), and stores the 16 even bytes to dst_y. The
// loop always runs at least once and stops when width, decremented by 16 per
// iteration, is <= 0.
965void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
966  asm volatile (
967  "1:                                          \n"
968    MEMACCESS(0)
969    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
970    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
971    MEMACCESS(1)
972    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
973    "b.gt       1b                             \n"
974  : "+r"(src_yuy2),  // %0
975    "+r"(dst_y),     // %1
976    "+r"(width)        // %2
977  :
978  : "cc", "memory", "v0", "v1"  // Clobber List
979  );
980}
981
// Extracts the Y plane from a UYVY row by keeping every odd-indexed byte.
// Each iteration reads 32 bytes from src_uyvy, de-interleaves them into even
// bytes (v0) and odd bytes (v1), and stores the 16 odd bytes to dst_y. The
// loop always runs at least once and stops when width, decremented by 16 per
// iteration, is <= 0.
982void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
983  asm volatile (
984  "1:                                          \n"
985    MEMACCESS(0)
986    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
987    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
988    MEMACCESS(1)
989    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
990    "b.gt       1b                             \n"
991  : "+r"(src_uyvy),  // %0
992    "+r"(dst_y),     // %1
993    "+r"(width)        // %2
994  :
995  : "cc", "memory", "v0", "v1"  // Clobber List
996  );
997}
998
// Extracts U and V from a single YUY2 row without vertical filtering.
// Each iteration reads 32 bytes (16 pixels) and splits them into four byte
// streams; bytes at offset 1 of every 4-byte group go to dst_u and bytes at
// offset 3 go to dst_v, giving 8 U and 8 V per iteration. The loop always
// runs at least once and stops when width, decremented by 16, is <= 0.
999void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
1000                         uint8* dst_u,
1001                         uint8* dst_v,
1002                         int width) {
1003  asm volatile (
1004  "1:                                          \n"
1005    MEMACCESS(0)
1006    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
1007    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1008    MEMACCESS(1)
1009    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
1010    MEMACCESS(2)
1011    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
1012    "b.gt       1b                             \n"
1013  : "+r"(src_yuy2),  // %0
1014    "+r"(dst_u),     // %1
1015    "+r"(dst_v),     // %2
1016    "+r"(width)        // %3
1017  :
1018  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1019  );
1020}
1021
// Extracts U and V from a single UYVY row without vertical filtering.
// Each iteration reads 32 bytes (16 pixels) and splits them into four byte
// streams; bytes at offset 0 of every 4-byte group go to dst_u and bytes at
// offset 2 go to dst_v, giving 8 U and 8 V per iteration. The loop always
// runs at least once and stops when width, decremented by 16, is <= 0.
1022void UYVYToUV422Row_NEON(const uint8* src_uyvy,
1023                         uint8* dst_u,
1024                         uint8* dst_v,
1025                         int width) {
1026  asm volatile (
1027  "1:                                          \n"
1028    MEMACCESS(0)
1029    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
1030    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1031    MEMACCESS(1)
1032    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
1033    MEMACCESS(2)
1034    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
1035    "b.gt       1b                             \n"
1036  : "+r"(src_uyvy),  // %0
1037    "+r"(dst_u),     // %1
1038    "+r"(dst_v),     // %2
1039    "+r"(width)        // %3
1040  :
1041  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1042  );
1043}
1044
// Extracts U and V from two vertically adjacent YUY2 rows and averages them.
// The second row starts at src_yuy2 + stride_yuy2. Each iteration reads 32
// bytes (16 pixels) from each row, takes the bytes at offsets 1 (U) and 3 (V)
// of every 4-byte group, computes the rounding average of the two rows, and
// stores 8 U to dst_u and 8 V to dst_v. The loop always runs at least once
// and stops when width, decremented by 16 per iteration, is <= 0.
1045void YUY2ToUVRow_NEON(const uint8* src_yuy2,
1046                      int stride_yuy2,
1047                      uint8* dst_u,
1048                      uint8* dst_v,
1049                      int width) {
1050  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1051  asm volatile (
1052  "1:                                          \n"
1053    MEMACCESS(0)
1054    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1055    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1056    MEMACCESS(1)
1057    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1058    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
1059    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
1060    MEMACCESS(2)
1061    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
1062    MEMACCESS(3)
1063    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
1064    "b.gt       1b                             \n"
1065  : "+r"(src_yuy2),     // %0
1066    "+r"(src_yuy2b),    // %1
1067    "+r"(dst_u),        // %2
1068    "+r"(dst_v),        // %3
1069    "+r"(width)           // %4
1070  :
1071  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1072    "v5", "v6", "v7"  // Clobber List
1073  );
1074}
1075
// Extracts U and V from two vertically adjacent UYVY rows and averages them.
// The second row starts at src_uyvy + stride_uyvy. Each iteration reads 32
// bytes (16 pixels) from each row, takes the bytes at offsets 0 (U) and 2 (V)
// of every 4-byte group, computes the rounding average of the two rows, and
// stores 8 U to dst_u and 8 V to dst_v. The loop always runs at least once
// and stops when width, decremented by 16 per iteration, is <= 0.
1076void UYVYToUVRow_NEON(const uint8* src_uyvy,
1077                      int stride_uyvy,
1078                      uint8* dst_u,
1079                      uint8* dst_v,
1080                      int width) {
1081  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1082  asm volatile (
1083  "1:                                          \n"
1084    MEMACCESS(0)
1085    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1086    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1087    MEMACCESS(1)
1088    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1089    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
1090    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
1091    MEMACCESS(2)
1092    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
1093    MEMACCESS(3)
1094    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
1095    "b.gt       1b                             \n"
1096  : "+r"(src_uyvy),     // %0
1097    "+r"(src_uyvyb),    // %1
1098    "+r"(dst_u),        // %2
1099    "+r"(dst_v),        // %3
1100    "+r"(width)           // %4
1101  :
1102  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1103    "v5", "v6", "v7"  // Clobber List
1104  );
1105}
1106
1107// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders bytes within each group of 4 pixels using a table lookup.
// shuffler points to 16 byte indices, loaded once before the loop. On every
// iteration 16 source bytes are loaded and output byte i is taken from source
// byte shuffler[i] of the same 16-byte group. The loop always runs at least
// once and stops when width, decremented by 4 per iteration, is <= 0.
1108void ARGBShuffleRow_NEON(const uint8* src_argb,
1109                         uint8* dst_argb,
1110                         const uint8* shuffler,
1111                         int width) {
1112  asm volatile (
1113    MEMACCESS(3)
1114    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
1115  "1:                                          \n"
1116    MEMACCESS(0)
1117    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
1118    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
1119    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
1120    MEMACCESS(1)
1121    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
1122    "b.gt       1b                             \n"
1123  : "+r"(src_argb),  // %0
1124    "+r"(dst_argb),  // %1
1125    "+r"(width)        // %2
1126  : "r"(shuffler)    // %3
1127  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1128  );
1129}
1130
// Interleaves separate Y, U and V rows into packed Y0 U Y1 V groups (YUY2).
// Each iteration reads 16 Y bytes, 8 U bytes and 8 V bytes and writes 32
// bytes to dst_yuy2. The loop always runs at least once and stops when width,
// decremented by 16 per iteration, is <= 0.
1131void I422ToYUY2Row_NEON(const uint8* src_y,
1132                        const uint8* src_u,
1133                        const uint8* src_v,
1134                        uint8* dst_yuy2,
1135                        int width) {
1136  asm volatile (
1137  "1:                                          \n"
1138    MEMACCESS(0)
    // v0 = even-indexed Ys, v1 = odd-indexed Ys.
1139    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
    // Move the odd Ys to v2 so v1 can receive U for the st4 register order.
1140    "orr        v2.8b, v1.8b, v1.8b            \n"
1141    MEMACCESS(1)
1142    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
1143    MEMACCESS(2)
1144    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
1145    "subs       %w4, %w4, #16                  \n"  // 16 pixels
1146    MEMACCESS(3)
1147    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1148    "b.gt       1b                             \n"
1149  : "+r"(src_y),     // %0
1150    "+r"(src_u),     // %1
1151    "+r"(src_v),     // %2
1152    "+r"(dst_yuy2),  // %3
1153    "+r"(width)      // %4
1154  :
1155  : "cc", "memory", "v0", "v1", "v2", "v3"
1156  );
1157}
1158
// Interleaves separate Y, U and V rows into packed U Y0 V Y1 groups (UYVY).
// Each iteration reads 16 Y bytes, 8 U bytes and 8 V bytes and writes 32
// bytes to dst_uyvy. The loop always runs at least once and stops when width,
// decremented by 16 per iteration, is <= 0.
1159void I422ToUYVYRow_NEON(const uint8* src_y,
1160                        const uint8* src_u,
1161                        const uint8* src_v,
1162                        uint8* dst_uyvy,
1163                        int width) {
1164  asm volatile (
1165  "1:                                          \n"
1166    MEMACCESS(0)
    // v1 = even-indexed Ys, v2 = odd-indexed Ys.
1167    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
    // Move the odd Ys to v3 so v2 can receive V for the st4 register order.
1168    "orr        v3.8b, v2.8b, v2.8b            \n"
1169    MEMACCESS(1)
1170    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
1171    MEMACCESS(2)
1172    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
1173    "subs       %w4, %w4, #16                  \n"  // 16 pixels
1174    MEMACCESS(3)
1175    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1176    "b.gt       1b                             \n"
1177  : "+r"(src_y),     // %0
1178    "+r"(src_u),     // %1
1179    "+r"(src_v),     // %2
1180    "+r"(dst_uyvy),  // %3
1181    "+r"(width)      // %4
1182  :
1183  : "cc", "memory", "v0", "v1", "v2", "v3"
1184  );
1185}
1186
// Converts 4-byte ARGB pixels to 16-bit RGB565 pixels.
// Each iteration loads 32 bytes (8 pixels) from src_argb de-interleaved into
// v20..v23, runs the ARGBTORGB565 asm fragment, and stores the 16 bytes left
// in v0 to dst_rgb565. The loop always runs at least once and stops when
// width, decremented by 8 per iteration, is <= 0.
// NOTE(review): ARGBTORGB565 is defined elsewhere in this file; presumably it
// packs v20/v21/v22 into v0 without touching other registers - confirm
// against the clobber list.
1187void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1188  asm volatile (
1189  "1:                                          \n"
1190    MEMACCESS(0)
1191    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1192    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1193    ARGBTORGB565
1194    MEMACCESS(1)
1195    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
1196    "b.gt       1b                             \n"
1197  : "+r"(src_argb),  // %0
1198    "+r"(dst_rgb565),  // %1
1199    "+r"(width)        // %2
1200  :
1201  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1202  );
1203}
1204
// Converts 4-byte ARGB pixels to 16-bit RGB565 pixels with ordered dithering.
// dither4 holds 4 dither bytes; it is replicated across v1 so that, within
// each group of 8 pixels, pixel i has byte (i % 4) of dither4 added (with
// unsigned saturation) to each of its first three channel bytes before
// packing. The fourth byte (alpha) is not dithered.
// Each iteration loads 32 bytes (8 pixels) from src_argb, runs the
// ARGBTORGB565 asm fragment, and stores the 16 bytes left in v0 to dst_rgb.
// The loop always runs at least once and stops when width, decremented by 8
// per iteration, is <= 0.
//
// src_argb and width are modified by the asm (post-indexed load and subs),
// so they are declared as read-write "+r" operands. Declaring them as
// input-only operands would let the compiler assume the registers still hold
// their original values after the asm statement.
1205void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
1206                                uint8* dst_rgb,
1207                                const uint32 dither4,
1208                                int width) {
1209  asm volatile (
1210    "dup        v1.4s, %w3                     \n"  // dither4
1211  "1:                                          \n"
1212    MEMACCESS(0)
1213    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1214    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1215    "uqadd      v20.8b, v20.8b, v1.8b          \n"
1216    "uqadd      v21.8b, v21.8b, v1.8b          \n"
1217    "uqadd      v22.8b, v22.8b, v1.8b          \n"
1218    ARGBTORGB565
1219    MEMACCESS(1)
1220    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
1221    "b.gt       1b                             \n"
1222  : "+r"(src_argb),  // %0
1223    "+r"(dst_rgb),   // %1
1224    "+r"(width)      // %2
1225  : "r"(dither4)     // %3
1226  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
1227  );
1228}
1229
// Converts 4-byte ARGB pixels to 16-bit ARGB1555 pixels.
// Each iteration loads 32 bytes (8 pixels) from src_argb de-interleaved into
// v20..v23, runs the ARGBTOARGB1555 asm fragment, and stores the 16 bytes
// left in v0 to dst_argb1555. The loop always runs at least once and stops
// when width, decremented by 8 per iteration, is <= 0.
// NOTE(review): ARGBTOARGB1555 is defined elsewhere in this file; presumably
// it packs v20..v23 into v0 without touching other registers - confirm
// against the clobber list.
1230void ARGBToARGB1555Row_NEON(const uint8* src_argb,
1231                            uint8* dst_argb1555,
1232                            int width) {
1233  asm volatile (
1234  "1:                                          \n"
1235    MEMACCESS(0)
1236    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1237    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1238    ARGBTOARGB1555
1239    MEMACCESS(1)
1240    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
1241    "b.gt       1b                             \n"
1242  : "+r"(src_argb),  // %0
1243    "+r"(dst_argb1555),  // %1
1244    "+r"(width)        // %2
1245  :
1246  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1247  );
1248}
1249
// Converts 4-byte ARGB pixels to 16-bit ARGB4444 pixels.
// v4 is preloaded with 0x0f in every byte as a mask for the ARGBTOARGB4444
// asm fragment. Each iteration loads 32 bytes (8 pixels) from src_argb
// de-interleaved into v20..v23, runs the fragment, and stores the 16 bytes
// left in v0 to dst_argb4444. The loop always runs at least once and stops
// when width, decremented by 8 per iteration, is <= 0.
// NOTE(review): ARGBTOARGB4444 is defined elsewhere in this file; presumably
// it uses v1 and v4 as listed in the clobbers - confirm.
1250void ARGBToARGB4444Row_NEON(const uint8* src_argb,
1251                            uint8* dst_argb4444,
1252                            int width) {
1253  asm volatile (
1254    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
1255  "1:                                          \n"
1256    MEMACCESS(0)
1257    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1258    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1259    ARGBTOARGB4444
1260    MEMACCESS(1)
1261    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
1262    "b.gt       1b                             \n"
1263  : "+r"(src_argb),      // %0
1264    "+r"(dst_argb4444),  // %1
1265    "+r"(width)            // %2
1266  :
1267  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1268  );
1269}
1270
// Computes a luma (Y) row from 4-byte B, G, R, A pixels.
// Per pixel: Y = sat8(round((13 * B + 65 * G + 33 * R) / 128)) + 16, with the
// final add saturating at 255. Alpha is ignored; its register (v3) is reused
// as the 16-bit accumulator. Each iteration reads 32 bytes (8 pixels) from
// src_argb and writes 8 bytes to dst_y. The loop always runs at least once
// and stops when width, decremented by 8 per iteration, is <= 0.
1271void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1272  asm volatile (
1273    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1274    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1275    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1276    "movi       v7.8b, #16                     \n"  // Add 16 constant
1277  "1:                                          \n"
1278    MEMACCESS(0)
1279    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1280    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1281    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1282    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1283    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1284    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1285    "uqadd      v0.8b, v0.8b, v7.8b            \n"
1286    MEMACCESS(1)
1287    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1288    "b.gt       1b                             \n"
1289  : "+r"(src_argb),  // %0
1290    "+r"(dst_y),     // %1
1291    "+r"(width)        // %2
1292  :
1293  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1294  );
1295}
1296
// Copies the fourth byte (alpha) of every 4-byte pixel into a separate row.
// Each iteration reads 64 bytes (16 pixels) from src_argb and writes 16 bytes
// to dst_a. The loop always runs at least once and stops when width,
// decremented by 16 per iteration, is <= 0.
1297void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
1298  asm volatile (
1299  "1:                                          \n"
1300    MEMACCESS(0)
1301    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels
1302    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
1303    MEMACCESS(1)
1304    "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
1305    "b.gt       1b                             \n"
1306  : "+r"(src_argb),   // %0
1307    "+r"(dst_a),      // %1
1308    "+r"(width)       // %2
1309  :
1310  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1311  );
1312}
1313
// Computes a full-range luma (YJ) row from 4-byte B, G, R, A pixels.
// Per pixel: Y = sat8(round((15 * B + 75 * G + 38 * R) / 128)); unlike
// ARGBToYRow_NEON no offset is added. The coefficients sum to 128, so the
// 16-bit accumulator never exceeds 15 bits. Alpha is ignored; its register
// (v3) is reused as the accumulator. Each iteration reads 32 bytes (8 pixels)
// from src_argb and writes 8 bytes to dst_y. The loop always runs at least
// once and stops when width, decremented by 8 per iteration, is <= 0.
1314void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1315  asm volatile (
1316    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
1317    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
1318    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
1319  "1:                                          \n"
1320    MEMACCESS(0)
1321    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1322    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1323    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1324    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1325    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1326    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
1327    MEMACCESS(1)
1328    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1329    "b.gt       1b                             \n"
1330  : "+r"(src_argb),  // %0
1331    "+r"(dst_y),     // %1
1332    "+r"(width)        // %2
1333  :
1334  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1335  );
1336}
1337
1338// 8x1 pixels.
// Computes one U and one V value per pixel (no subsampling) from 4-byte
// B, G, R, A pixels:
//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
// The sums are formed in 16-bit lanes and narrowed with unsigned saturation.
// Alpha is ignored; its register (v3) is reused as the V accumulator. Each
// iteration reads 32 bytes (8 pixels) and writes 8 bytes to each of dst_u and
// dst_v. The loop always runs at least once and stops when width, decremented
// by 8 per iteration, is <= 0.
1339void ARGBToUV444Row_NEON(const uint8* src_argb,
1340                         uint8* dst_u,
1341                         uint8* dst_v,
1342                         int width) {
1343  asm volatile (
1344    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
1345    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
1346    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
1347    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
1348    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
1349    "movi       v29.16b,#0x80                  \n"  // 128.5
1350  "1:                                          \n"
1351    MEMACCESS(0)
1352    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1353    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1354    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
1355    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
1356    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
1357    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
1358
1359    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
1360    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
1361    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
1362    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
1363
1364    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
1365    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1366
1367    MEMACCESS(1)
1368    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1369    MEMACCESS(2)
1370    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1371    "b.gt       1b                             \n"
1372  : "+r"(src_argb),  // %0
1373    "+r"(dst_u),     // %1
1374    "+r"(dst_v),     // %2
1375    "+r"(width)        // %3
1376  :
1377  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1378    "v24", "v25", "v26", "v27", "v28", "v29"
1379  );
1380}
1381
// Asm fragment: loads the constants that RGBTOUV reads from v20..v25.
// v20..v24 hold 16-bit coefficient magnitudes in every lane and v25 holds
// 0x8080 in every 16-bit lane (the bias added before the final >> 8). The
// coefficients are half of the per-pixel ones because the callers in this
// file feed RGBTOUV with channel sums that are twice the 2x2 average.
1382#define RGBTOUV_SETUP_REG                                                  \
1383  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
1384  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
1385  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
1386  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
1387  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
1388  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
1389
1390// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Asm fragment: computes 8 U and 8 V bytes from three vectors of 16-bit
// channel values. QB, QG and QR are the register operands (e.g. v0.8h)
// holding the B, G and R inputs; they are pasted into the instruction text.
//   U (v0.8b) = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8
//   V (v1.8b) = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8
// Arithmetic is done in 16-bit lanes and the narrowing shift saturates as
// unsigned. v3 and v4 are used as accumulators; v20..v25 must already hold
// the constants (RGBTOUV_SETUP_REG loads one such set).
1391#define RGBTOUV(QB, QG, QR)                                                 \
1392  "mul        v3.8h, " #QB                                                  \
1393  ",v20.8h          \n" /* B                    */                          \
1394  "mul        v4.8h, " #QR                                                  \
1395  ",v20.8h          \n" /* R                    */                          \
1396  "mls        v3.8h, " #QG                                                  \
1397  ",v21.8h          \n" /* G                    */                          \
1398  "mls        v4.8h, " #QG                                                  \
1399  ",v24.8h          \n" /* G                    */                          \
1400  "mls        v3.8h, " #QR                                                  \
1401  ",v22.8h          \n" /* R                    */                          \
1402  "mls        v4.8h, " #QB                                                  \
1403  ",v23.8h          \n"                          /* B                    */ \
1404  "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
1405  "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
1406  "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
1407  "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */
1408
1409// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1410// TODO(fbarchard): consider ptrdiff_t for all strides.
1411
// Computes 2x2-subsampled U and V from two adjacent rows of 4-byte
// B, G, R, A pixels. The second row starts at src_argb + src_stride_argb.
// Per iteration, 16 pixels are loaded from each row; for each channel the
// four samples of every 2x2 block are summed and halved with rounding (giving
// twice the block average), then RGBTOUV applies the halved coefficients
// loaded by RGBTOUV_SETUP_REG. 8 U bytes and 8 V bytes are stored per
// iteration. Alpha is ignored. width counts pixels of one row; the loop
// always runs at least once and stops when width, decremented by 16 per
// iteration, is <= 0.
1412void ARGBToUVRow_NEON(const uint8* src_argb,
1413                      int src_stride_argb,
1414                      uint8* dst_u,
1415                      uint8* dst_v,
1416                      int width) {
1417  const uint8* src_argb_1 = src_argb + src_stride_argb;
1418  asm volatile (
1419    RGBTOUV_SETUP_REG
1420  "1:                                          \n"
1421    MEMACCESS(0)
1422    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    // Pairwise add horizontally adjacent samples of the first row.
1423    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1424    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1425    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1426
1427    MEMACCESS(1)
1428    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
    // Accumulate the pairwise sums of the second row: 4 samples per lane.
1429    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1430    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1431    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1432
1433    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1434    "urshr      v1.8h, v1.8h, #1               \n"
1435    "urshr      v2.8h, v2.8h, #1               \n"
1436
1437    "subs       %w4, %w4, #16                  \n"  // 16 per row (32 over both rows) per loop.
1438    RGBTOUV(v0.8h, v1.8h, v2.8h)
1439    MEMACCESS(2)
1440    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1441    MEMACCESS(3)
1442    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1443    "b.gt       1b                             \n"
1444  : "+r"(src_argb),  // %0
1445    "+r"(src_argb_1),  // %1
1446    "+r"(dst_u),     // %2
1447    "+r"(dst_v),     // %3
1448    "+r"(width)        // %4
1449  :
1450  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1451    "v20", "v21", "v22", "v23", "v24", "v25"
1452  );
1453}
1454
1455// TODO(fbarchard): Subsample match C code.
// Produces one row of subsampled U and V from two adjacent rows of ARGB,
// using the coefficient set loaded inline below (v20-v25) rather than
// RGBTOUV_SETUP_REG. The second row is read from src_argb + src_stride_argb.
// Each loop pass consumes 16 pixels (64 bytes) from both rows and emits
// 8 U and 8 V bytes. width is reduced by 16 per pass with no remainder
// handling, so it is presumably a positive multiple of 16 - TODO confirm.
1456void ARGBToUVJRow_NEON(const uint8* src_argb,
1457                       int src_stride_argb,
1458                       uint8* dst_u,
1459                       uint8* dst_v,
1460                       int width) {
1461  const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row
1462  asm volatile (
1463    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
1464    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
1465    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
1466    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
1467    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
1468    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1469  "1:                                          \n"
1470    MEMACCESS(0)
1471    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1472    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1473    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1474    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1475    MEMACCESS(1)
1476    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
1477    "uadalp     v0.8h, v4.16b                  \n"  // B: add row 2 pairs.
1478    "uadalp     v1.8h, v5.16b                  \n"  // G: add row 2 pairs.
1479    "uadalp     v2.8h, v6.16b                  \n"  // R: add row 2 pairs.
1480
1481    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average (2x2 sum / 2, rounded)
1482    "urshr      v1.8h, v1.8h, #1               \n"
1483    "urshr      v2.8h, v2.8h, #1               \n"
1484
1485    "subs       %w4, %w4, #16                  \n"  // 16 pixels x 2 rows per loop.
1486    RGBTOUV(v0.8h, v1.8h, v2.8h)  // args are B, G, R; U ends up in v0, V in v1
1487    MEMACCESS(2)
1488    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1489    MEMACCESS(3)
1490    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1491    "b.gt       1b                             \n"  // loop while width > 0
1492  : "+r"(src_argb),  // %0
1493    "+r"(src_argb_1),  // %1
1494    "+r"(dst_u),     // %2
1495    "+r"(dst_v),     // %3
1496    "+r"(width)        // %4
1497  :
1498  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1499    "v20", "v21", "v22", "v23", "v24", "v25"
1500  );
1501}
1502
// Produces one row of subsampled U and V from two adjacent rows of BGRA.
// Same pipeline as the ARGB variant, but the channels are picked from the
// de-interleaved registers in the order this layout needs (B from the 4th
// register, G from the 3rd, R from the 2nd). The second row is read from
// src_bgra + src_stride_bgra. Each loop pass consumes 16 pixels (64 bytes)
// from both rows and emits 8 U and 8 V bytes. width is reduced by 16 per
// pass with no remainder handling - presumably a positive multiple of 16.
1503void BGRAToUVRow_NEON(const uint8* src_bgra,
1504                      int src_stride_bgra,
1505                      uint8* dst_u,
1506                      uint8* dst_v,
1507                      int width) {
1508  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;  // second source row
1509  asm volatile (
1510    RGBTOUV_SETUP_REG  // presumably loads the v20-v25 constants RGBTOUV uses
1511  "1:                                          \n"
1512    MEMACCESS(0)
1513    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1514    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
1515    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1516    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
1517    MEMACCESS(1)
1518    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
1519    "uadalp     v0.8h, v7.16b                  \n"  // B: add row 2 pairs.
1520    "uadalp     v3.8h, v6.16b                  \n"  // G: add row 2 pairs.
1521    "uadalp     v2.8h, v5.16b                  \n"  // R: add row 2 pairs.
1522
1523    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average (2x2 sum / 2, rounded)
1524    "urshr      v1.8h, v3.8h, #1               \n"  // G moves from v3 to v1 here
1525    "urshr      v2.8h, v2.8h, #1               \n"
1526
1527    "subs       %w4, %w4, #16                  \n"  // 16 pixels x 2 rows per loop.
1528    RGBTOUV(v0.8h, v1.8h, v2.8h)  // args are B, G, R; U ends up in v0, V in v1
1529    MEMACCESS(2)
1530    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1531    MEMACCESS(3)
1532    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1533    "b.gt       1b                             \n"  // loop while width > 0
1534  : "+r"(src_bgra),  // %0
1535    "+r"(src_bgra_1),  // %1
1536    "+r"(dst_u),     // %2
1537    "+r"(dst_v),     // %3
1538    "+r"(width)        // %4
1539  :
1540  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1541    "v20", "v21", "v22", "v23", "v24", "v25"
1542  );
1543}
1544
// Produces one row of subsampled U and V from two adjacent rows of ABGR.
// Channels are taken from the de-interleaved registers as B = 3rd, G = 2nd,
// R = 1st. The second row is read from src_abgr + src_stride_abgr. Each
// loop pass consumes 16 pixels (64 bytes) from both rows and emits 8 U and
// 8 V bytes. width is reduced by 16 per pass with no remainder handling -
// presumably a positive multiple of 16, TODO confirm.
1545void ABGRToUVRow_NEON(const uint8* src_abgr,
1546                      int src_stride_abgr,
1547                      uint8* dst_u,
1548                      uint8* dst_v,
1549                      int width) {
1550  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;  // second source row
1551  asm volatile (
1552    RGBTOUV_SETUP_REG  // presumably loads the v20-v25 constants RGBTOUV uses
1553  "1:                                          \n"
1554    MEMACCESS(0)
1555    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1556    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1557    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1558    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1559    MEMACCESS(1)
1560    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1561    "uadalp     v3.8h, v6.16b                  \n"  // B: add row 2 pairs.
1562    "uadalp     v2.8h, v5.16b                  \n"  // G: add row 2 pairs.
1563    "uadalp     v1.8h, v4.16b                  \n"  // R: add row 2 pairs.
1564
1565    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average; B moves from v3 to v0
1566    "urshr      v2.8h, v2.8h, #1               \n"
1567    "urshr      v1.8h, v1.8h, #1               \n"
1568
1569    "subs       %w4, %w4, #16                  \n"  // 16 pixels x 2 rows per loop.
1570    RGBTOUV(v0.8h, v2.8h, v1.8h)  // args are B, G, R; U ends up in v0, V in v1
1571    MEMACCESS(2)
1572    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1573    MEMACCESS(3)
1574    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1575    "b.gt       1b                             \n"  // loop while width > 0
1576  : "+r"(src_abgr),  // %0
1577    "+r"(src_abgr_1),  // %1
1578    "+r"(dst_u),     // %2
1579    "+r"(dst_v),     // %3
1580    "+r"(width)        // %4
1581  :
1582  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1583    "v20", "v21", "v22", "v23", "v24", "v25"
1584  );
1585}
1586
// Produces one row of subsampled U and V from two adjacent rows of RGBA.
// Channels are taken from the de-interleaved registers as B = 2nd, G = 3rd,
// R = 4th. The second row is read from src_rgba + src_stride_rgba. Each
// loop pass consumes 16 pixels (64 bytes) from both rows and emits 8 U and
// 8 V bytes. width is reduced by 16 per pass with no remainder handling -
// presumably a positive multiple of 16, TODO confirm.
1587void RGBAToUVRow_NEON(const uint8* src_rgba,
1588                      int src_stride_rgba,
1589                      uint8* dst_u,
1590                      uint8* dst_v,
1591                      int width) {
1592  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;  // second source row
1593  asm volatile (
1594    RGBTOUV_SETUP_REG  // presumably loads the v20-v25 constants RGBTOUV uses
1595  "1:                                          \n"
1596    MEMACCESS(0)
1597    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1598    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
1599    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1600    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
1601    MEMACCESS(1)
1602    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1603    "uadalp     v0.8h, v5.16b                  \n"  // B: add row 2 pairs.
1604    "uadalp     v1.8h, v6.16b                  \n"  // G: add row 2 pairs.
1605    "uadalp     v2.8h, v7.16b                  \n"  // R: add row 2 pairs.
1606
1607    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average (2x2 sum / 2, rounded)
1608    "urshr      v1.8h, v1.8h, #1               \n"
1609    "urshr      v2.8h, v2.8h, #1               \n"
1610
1611    "subs       %w4, %w4, #16                  \n"  // 16 pixels x 2 rows per loop.
1612    RGBTOUV(v0.8h, v1.8h, v2.8h)  // args are B, G, R; U ends up in v0, V in v1
1613    MEMACCESS(2)
1614    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1615    MEMACCESS(3)
1616    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1617    "b.gt       1b                             \n"  // loop while width > 0
1618  : "+r"(src_rgba),  // %0
1619    "+r"(src_rgba_1),  // %1
1620    "+r"(dst_u),     // %2
1621    "+r"(dst_v),     // %3
1622    "+r"(width)        // %4
1623  :
1624  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1625    "v20", "v21", "v22", "v23", "v24", "v25"
1626  );
1627}
1628
// Produces one row of subsampled U and V from two adjacent rows of RGB24
// (3 bytes per pixel, de-interleaved as B, G, R). The second row is read
// from src_rgb24 + src_stride_rgb24. Each loop pass consumes 16 pixels
// (48 bytes) from both rows and emits 8 U and 8 V bytes. width is reduced
// by 16 per pass with no remainder handling - presumably a positive
// multiple of 16, TODO confirm.
1629void RGB24ToUVRow_NEON(const uint8* src_rgb24,
1630                       int src_stride_rgb24,
1631                       uint8* dst_u,
1632                       uint8* dst_v,
1633                       int width) {
1634  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;  // second source row
1635  asm volatile (
1636    RGBTOUV_SETUP_REG  // presumably loads the v20-v25 constants RGBTOUV uses
1637  "1:                                          \n"
1638    MEMACCESS(0)
1639    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
1640    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1641    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1642    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1643    MEMACCESS(1)
1644    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
1645    "uadalp     v0.8h, v4.16b                  \n"  // B: add row 2 pairs.
1646    "uadalp     v1.8h, v5.16b                  \n"  // G: add row 2 pairs.
1647    "uadalp     v2.8h, v6.16b                  \n"  // R: add row 2 pairs.
1648
1649    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average (2x2 sum / 2, rounded)
1650    "urshr      v1.8h, v1.8h, #1               \n"
1651    "urshr      v2.8h, v2.8h, #1               \n"
1652
1653    "subs       %w4, %w4, #16                  \n"  // 16 pixels x 2 rows per loop.
1654    RGBTOUV(v0.8h, v1.8h, v2.8h)  // args are B, G, R; U ends up in v0, V in v1
1655    MEMACCESS(2)
1656    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1657    MEMACCESS(3)
1658    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1659    "b.gt       1b                             \n"  // loop while width > 0
1660  : "+r"(src_rgb24),  // %0
1661    "+r"(src_rgb24_1),  // %1
1662    "+r"(dst_u),     // %2
1663    "+r"(dst_v),     // %3
1664    "+r"(width)        // %4
1665  :
1666  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1667    "v20", "v21", "v22", "v23", "v24", "v25"
1668  );
1669}
1670
// Produces one row of subsampled U and V from two adjacent rows of RAW
// (3 bytes per pixel, de-interleaved as R, G, B - the reverse of RGB24).
// The second row is read from src_raw + src_stride_raw. Each loop pass
// consumes 16 pixels (48 bytes) from both rows and emits 8 U and 8 V bytes.
// width is reduced by 16 per pass with no remainder handling - presumably
// a positive multiple of 16, TODO confirm.
1671void RAWToUVRow_NEON(const uint8* src_raw,
1672                     int src_stride_raw,
1673                     uint8* dst_u,
1674                     uint8* dst_v,
1675                     int width) {
1676  const uint8* src_raw_1 = src_raw + src_stride_raw;  // second source row
1677  asm volatile (
1678    RGBTOUV_SETUP_REG  // presumably loads the v20-v25 constants RGBTOUV uses
1679  "1:                                          \n"
1680    MEMACCESS(0)
1681    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
1682    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1683    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1684    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1685    MEMACCESS(1)
1686    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
1687    "uadalp     v2.8h, v6.16b                  \n"  // B: add row 2 pairs.
1688    "uadalp     v1.8h, v5.16b                  \n"  // G: add row 2 pairs.
1689    "uadalp     v0.8h, v4.16b                  \n"  // R: add row 2 pairs.
1690
1691    "urshr      v2.8h, v2.8h, #1               \n"  // 2x average (2x2 sum / 2, rounded)
1692    "urshr      v1.8h, v1.8h, #1               \n"
1693    "urshr      v0.8h, v0.8h, #1               \n"
1694
1695    "subs       %w4, %w4, #16                  \n"  // 16 pixels x 2 rows per loop.
1696    RGBTOUV(v2.8h, v1.8h, v0.8h)  // args are B, G, R; U ends up in v0, V in v1
1697    MEMACCESS(2)
1698    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1699    MEMACCESS(3)
1700    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1701    "b.gt       1b                             \n"  // loop while width > 0
1702  : "+r"(src_raw),  // %0
1703    "+r"(src_raw_1),  // %1
1704    "+r"(dst_u),     // %2
1705    "+r"(dst_v),     // %3
1706    "+r"(width)        // %4
1707  :
1708  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1709    "v20", "v21", "v22", "v23", "v24", "v25"
1710  );
1711}
1712
1713// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Produces one row of subsampled U and V from two adjacent rows of RGB565.
// The second row is read from src_rgb565 + src_stride_rgb565. Each loop
// pass does four 8-pixel loads (two per row, 16 bytes each), expands each
// with RGB565TOARGB, sums 2x2 neighbourhoods and emits 8 U and 8 V bytes.
// The UV constants live in v22-v27 here because v16-v21 hold the sums.
// width is reduced by 16 per pass with no remainder handling - presumably
// a positive multiple of 16, TODO confirm.
1714void RGB565ToUVRow_NEON(const uint8* src_rgb565,
1715                        int src_stride_rgb565,
1716                        uint8* dst_u,
1717                        uint8* dst_v,
1718                        int width) {
1719  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;  // second source row
1720  asm volatile (
1721    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
1722    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
1723    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
1724    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
1725    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
1726    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1727  "1:                                          \n"
1728    MEMACCESS(0)
1729    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
1730    RGB565TOARGB  // defined elsewhere; B, G, R are read from v0, v1, v2 below
1731    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1732    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1733    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1734    MEMACCESS(0)
1735    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
1736    RGB565TOARGB
1737    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1738    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1739    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1740
1741    MEMACCESS(1)
1742    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
1743    RGB565TOARGB
1744    "uadalp     v16.4h, v0.8b                  \n"  // B: add row 2 pairs.
1745    "uadalp     v18.4h, v1.8b                  \n"  // G: add row 2 pairs.
1746    "uadalp     v20.4h, v2.8b                  \n"  // R: add row 2 pairs.
1747    MEMACCESS(1)
1748    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
1749    RGB565TOARGB
1750    "uadalp     v17.4h, v0.8b                  \n"  // B: add row 2 pairs.
1751    "uadalp     v19.4h, v1.8b                  \n"  // G: add row 2 pairs.
1752    "uadalp     v21.4h, v2.8b                  \n"  // R: add row 2 pairs.
1753
1754    "ins        v16.D[1], v17.D[0]             \n"  // join both halves: 8 B sums
1755    "ins        v18.D[1], v19.D[0]             \n"  // 8 G sums
1756    "ins        v20.D[1], v21.D[0]             \n"  // 8 R sums
1757
1758    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1759    "urshr      v5.8h, v18.8h, #1              \n"
1760    "urshr      v6.8h, v20.8h, #1              \n"
1761
1762    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1763    "mul        v16.8h, v4.8h, v22.8h          \n"  // B
1764    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
1765    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
1766    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
1767    "mul        v17.8h, v6.8h, v22.8h          \n"  // R
1768    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
1769    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
1770    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
1771    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
1772    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
1773    MEMACCESS(2)
1774    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1775    MEMACCESS(3)
1776    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1777    "b.gt       1b                             \n"  // loop while width > 0
1778  : "+r"(src_rgb565),  // %0
1779    "+r"(src_rgb565_1),  // %1
1780    "+r"(dst_u),     // %2
1781    "+r"(dst_v),     // %3
1782    "+r"(width)        // %4
1783  :
1784  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1785    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
1786    "v25", "v26", "v27"
1787  );
1788}
1789
1790// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Produces one row of subsampled U and V from two adjacent rows of
// ARGB1555. The second row is read from src_argb1555 + src_stride_argb1555.
// Each loop pass does four 8-pixel loads (two per row, 16 bytes each),
// expands each with RGB555TOARGB, sums 2x2 neighbourhoods and emits 8 U and
// 8 V bytes. The U/V arithmetic is written out inline using v20-v25.
// width is reduced by 16 per pass with no remainder handling - presumably
// a positive multiple of 16, TODO confirm.
1791void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
1792                          int src_stride_argb1555,
1793                          uint8* dst_u,
1794                          uint8* dst_v,
1795                          int width) {
1796  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;  // second source row
1797  asm volatile (
1798    RGBTOUV_SETUP_REG  // presumably loads the v20-v25 constants used below
1799  "1:                                          \n"
1800    MEMACCESS(0)
1801    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1802    RGB555TOARGB  // defined elsewhere; B, G, R are read from v0, v1, v2 below
1803    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1804    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1805    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1806    MEMACCESS(0)
1807    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
1808    RGB555TOARGB
1809    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1810    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1811    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1812
1813    MEMACCESS(1)
1814    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
1815    RGB555TOARGB
1816    "uadalp     v16.4h, v0.8b                  \n"  // B: add row 2 pairs.
1817    "uadalp     v17.4h, v1.8b                  \n"  // G: add row 2 pairs.
1818    "uadalp     v18.4h, v2.8b                  \n"  // R: add row 2 pairs.
1819    MEMACCESS(1)
1820    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
1821    RGB555TOARGB
1822    "uadalp     v26.4h, v0.8b                  \n"  // B: add row 2 pairs.
1823    "uadalp     v27.4h, v1.8b                  \n"  // G: add row 2 pairs.
1824    "uadalp     v28.4h, v2.8b                  \n"  // R: add row 2 pairs.
1825
1826    "ins        v16.D[1], v26.D[0]             \n"  // join both halves: 8 B sums
1827    "ins        v17.D[1], v27.D[0]             \n"  // 8 G sums
1828    "ins        v18.D[1], v28.D[0]             \n"  // 8 R sums
1829
1830    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1831    "urshr      v5.8h, v17.8h, #1              \n"
1832    "urshr      v6.8h, v18.8h, #1              \n"
1833
1834    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1835    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
1836    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
1837    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
1838    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
1839    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
1840    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
1841    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
1842    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1843    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
1844    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1845    MEMACCESS(2)
1846    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1847    MEMACCESS(3)
1848    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1849    "b.gt       1b                             \n"  // loop while width > 0
1850  : "+r"(src_argb1555),  // %0
1851    "+r"(src_argb1555_1),  // %1
1852    "+r"(dst_u),     // %2
1853    "+r"(dst_v),     // %3
1854    "+r"(width)        // %4
1855  :
1856  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1857    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1858    "v26", "v27", "v28"
1859  );
1860}
1861
1862// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Produces one row of subsampled U and V from two adjacent rows of
// ARGB4444. The second row is read from src_argb4444 + src_stride_argb4444.
// Each loop pass does four 8-pixel loads (two per row, 16 bytes each),
// expands each with ARGB4444TOARGB, sums 2x2 neighbourhoods and emits 8 U
// and 8 V bytes. The U/V arithmetic is written out inline using v20-v25.
// width is reduced by 16 per pass with no remainder handling - presumably
// a positive multiple of 16, TODO confirm.
1863void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
1864                          int src_stride_argb4444,
1865                          uint8* dst_u,
1866                          uint8* dst_v,
1867                          int width) {
1868  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;  // second source row
1869  asm volatile (
1870    RGBTOUV_SETUP_REG  // presumably loads the v20-v25 constants used below
1871  "1:                                          \n"
1872    MEMACCESS(0)
1873    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
1874    ARGB4444TOARGB  // defined elsewhere; B, G, R are read from v0, v1, v2 below
1875    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1876    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1877    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1878    MEMACCESS(0)
1879    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
1880    ARGB4444TOARGB
1881    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1882    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1883    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1884
1885    MEMACCESS(1)
1886    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
1887    ARGB4444TOARGB
1888    "uadalp     v16.4h, v0.8b                  \n"  // B: add row 2 pairs.
1889    "uadalp     v17.4h, v1.8b                  \n"  // G: add row 2 pairs.
1890    "uadalp     v18.4h, v2.8b                  \n"  // R: add row 2 pairs.
1891    MEMACCESS(1)
1892    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
1893    ARGB4444TOARGB
1894    "uadalp     v26.4h, v0.8b                  \n"  // B: add row 2 pairs.
1895    "uadalp     v27.4h, v1.8b                  \n"  // G: add row 2 pairs.
1896    "uadalp     v28.4h, v2.8b                  \n"  // R: add row 2 pairs.
1897
1898    "ins        v16.D[1], v26.D[0]             \n"  // join both halves: 8 B sums
1899    "ins        v17.D[1], v27.D[0]             \n"  // 8 G sums
1900    "ins        v18.D[1], v28.D[0]             \n"  // 8 R sums
1901
1902    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1903    "urshr      v5.8h, v17.8h, #1              \n"
1904    "urshr      v6.8h, v18.8h, #1              \n"
1905
1906    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1907    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
1908    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
1909    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
1910    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
1911    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
1912    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
1913    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
1914    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1915    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
1916    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1917    MEMACCESS(2)
1918    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1919    MEMACCESS(3)
1920    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1921    "b.gt       1b                             \n"  // loop while width > 0
1922  : "+r"(src_argb4444),  // %0
1923    "+r"(src_argb4444_1),  // %1
1924    "+r"(dst_u),     // %2
1925    "+r"(dst_v),     // %3
1926    "+r"(width)        // %4
1927  :
1928  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1929    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1930    "v26", "v27", "v28"
1931
1932  );
1933}
1934
// Converts a row of RGB565 pixels to Y (luma) bytes, 8 pixels per loop pass.
// Per pixel: Y = sat8(round((13*B + 65*G + 33*R) >> 7)) + 16, with the final
// add saturating. Each pass reads 16 bytes from src_rgb565 and writes 8
// bytes to dst_y. width is reduced by 8 per pass with no remainder
// handling - presumably a positive multiple of 8, TODO confirm.
1935void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
1936  asm volatile (
1937    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
1938    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
1939    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
1940    "movi       v27.8b, #16                    \n"  // Add 16 constant
1941  "1:                                          \n"
1942    MEMACCESS(0)
1943    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
1944    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1945    RGB565TOARGB  // defined elsewhere; B, G, R are read from v0, v1, v2 below
1946    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
1947    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
1948    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
1949    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1950    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
1951    MEMACCESS(1)
1952    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1953    "b.gt       1b                             \n"  // loop while width > 0
1954  : "+r"(src_rgb565),  // %0
1955    "+r"(dst_y),       // %1
1956    "+r"(width)          // %2
1957  :
1958  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
1959    "v24", "v25", "v26", "v27"
1960  );
1961}
1962
// Converts a row of ARGB1555 pixels to Y (luma) bytes, 8 pixels per loop
// pass. Per pixel: Y = sat8(round((13*B + 65*G + 33*R) >> 7)) + 16, with the
// final add saturating. Each pass reads 16 bytes from src_argb1555 and
// writes 8 bytes to dst_y. width is reduced by 8 per pass with no remainder
// handling - presumably a positive multiple of 8, TODO confirm.
// NOTE(review): the coefficients sit in v4-v7 across the ARGB1555TOARGB
// expansion, so that macro must not modify v4-v7 - confirm at its definition.
1963void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
1964  asm volatile (
1965    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1966    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1967    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1968    "movi       v7.8b, #16                     \n"  // Add 16 constant
1969  "1:                                          \n"
1970    MEMACCESS(0)
1971    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1972    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1973    ARGB1555TOARGB  // defined elsewhere; B, G, R are read from v0, v1, v2 below
1974    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1975    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1976    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1977    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1978    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
1979    MEMACCESS(1)
1980    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1981    "b.gt       1b                             \n"  // loop while width > 0
1982  : "+r"(src_argb1555),  // %0
1983    "+r"(dst_y),         // %1
1984    "+r"(width)            // %2
1985  :
1986  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1987  );
1988}
1989
// Converts a row of ARGB4444 pixels to Y (luma) bytes, 8 pixels per loop
// pass. Per pixel: Y = sat8(round((13*B + 65*G + 33*R) >> 7)) + 16, with the
// final add saturating. Each pass reads 16 bytes from src_argb4444 and
// writes 8 bytes to dst_y. width is reduced by 8 per pass with no remainder
// handling - presumably a positive multiple of 8, TODO confirm.
1990void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
1991  asm volatile (
1992    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
1993    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
1994    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
1995    "movi       v27.8b, #16                    \n"  // Add 16 constant
1996  "1:                                          \n"
1997    MEMACCESS(0)
1998    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
1999    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2000    ARGB4444TOARGB  // defined elsewhere; B, G, R are read from v0, v1, v2 below
2001    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
2002    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
2003    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
2004    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
2005    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
2006    MEMACCESS(1)
2007    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2008    "b.gt       1b                             \n"  // loop while width > 0
2009  : "+r"(src_argb4444),  // %0
2010    "+r"(dst_y),         // %1
2011    "+r"(width)            // %2
2012  :
2013  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2014  );
2015}
2016
// Converts a row of BGRA pixels to Y (luma) bytes, 8 pixels per loop pass.
// The four bytes of each pixel are de-interleaved into v0-v3; the 2nd, 3rd
// and 4th are used as R, G, B and the 1st is ignored.
// Per pixel: Y = sat8(round((33*R + 65*G + 13*B) >> 7)) + 16, with the final
// add saturating. Each pass reads 32 bytes and writes 8 bytes to dst_y.
// width is reduced by 8 per pass with no remainder handling - presumably a
// positive multiple of 8, TODO confirm.
2017void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
2018  asm volatile (
2019    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2020    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2021    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2022    "movi       v7.8b, #16                     \n"  // Add 16 constant
2023  "1:                                          \n"
2024    MEMACCESS(0)
2025    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2026    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2027    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
2028    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2029    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
2030    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2031    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2032    MEMACCESS(1)
2033    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2034    "b.gt       1b                             \n"  // loop while width > 0
2035  : "+r"(src_bgra),  // %0
2036    "+r"(dst_y),     // %1
2037    "+r"(width)        // %2
2038  :
2039  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2040  );
2041}
2042
// Converts a row of ABGR pixels to Y (luma) bytes, 8 pixels per loop pass.
// The four bytes of each pixel are de-interleaved into v0-v3; the 1st, 2nd
// and 3rd are used as R, G, B and the 4th is ignored.
// Per pixel: Y = sat8(round((33*R + 65*G + 13*B) >> 7)) + 16, with the final
// add saturating. Each pass reads 32 bytes and writes 8 bytes to dst_y.
// width is reduced by 8 per pass with no remainder handling - presumably a
// positive multiple of 8, TODO confirm.
2043void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
2044  asm volatile (
2045    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2046    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2047    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2048    "movi       v7.8b, #16                     \n"  // Add 16 constant
2049  "1:                                          \n"
2050    MEMACCESS(0)
2051    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2052    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2053    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
2054    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2055    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
2056    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2057    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2058    MEMACCESS(1)
2059    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2060    "b.gt       1b                             \n"  // loop while width > 0
2061  : "+r"(src_abgr),  // %0
2062    "+r"(dst_y),     // %1
2063    "+r"(width)        // %2
2064  :
2065  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2066  );
2067}
2068
// Converts a row of RGBA pixels to Y (luma) bytes, 8 pixels per loop pass.
// The four bytes of each pixel are de-interleaved into v0-v3; the 2nd, 3rd
// and 4th are used as B, G, R and the 1st is ignored.
// Per pixel: Y = sat8(round((13*B + 65*G + 33*R) >> 7)) + 16, with the final
// add saturating. Each pass reads 32 bytes and writes 8 bytes to dst_y.
// width is reduced by 8 per pass with no remainder handling - presumably a
// positive multiple of 8, TODO confirm.
2069void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
2070  asm volatile (
2071    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2072    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2073    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2074    "movi       v7.8b, #16                     \n"  // Add 16 constant
2075  "1:                                          \n"
2076    MEMACCESS(0)
2077    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2078    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2079    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
2080    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2081    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
2082    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2083    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2084    MEMACCESS(1)
2085    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2086    "b.gt       1b                             \n"  // loop while width > 0
2087  : "+r"(src_rgba),  // %0
2088    "+r"(dst_y),     // %1
2089    "+r"(width)        // %2
2090  :
2091  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2092  );
2093}
2094
// Converts a row of 3-byte RGB24 pixels to 8-bit luma, 8 pixels (24 source
// bytes) per iteration. ld3 de-interleaves the bytes of every pixel into
// v0..v2, which are weighted as B, G and R.
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// width is tested after each group of 8, so at least 8 pixels are always
// converted and width is effectively rounded up to a multiple of 8.
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
  asm volatile (
    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16, saturating.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags from subs above.
  : "+r"(src_rgb24),  // %0
    "+r"(dst_y),      // %1
    "+r"(width)         // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2120
// Converts a row of 3-byte RAW pixels to 8-bit luma, 8 pixels (24 source
// bytes) per iteration. ld3 de-interleaves the bytes of every pixel into
// v0..v2, which are multiplied by the R, G and B coefficients respectively.
//   Y = sat8(sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16)
// width is tested after each group of 8, so at least 8 pixels are always
// converted and width is effectively rounded up to a multiple of 8.
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
  asm volatile (
    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v0.8b, v4.8b           \n"  // R (v0 * R coefficient)
    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B (v2 * B coefficient)
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16, saturating.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags from subs above.
  : "+r"(src_raw),  // %0
    "+r"(dst_y),    // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2146
2147// Bilinear filter 16x2 -> 16x1
2148void InterpolateRow_NEON(uint8* dst_ptr,
2149                         const uint8* src_ptr,
2150                         ptrdiff_t src_stride,
2151                         int dst_width,
2152                         int source_y_fraction) {
2153  int y1_fraction = source_y_fraction;
2154  int y0_fraction = 256 - y1_fraction;
2155  const uint8* src_ptr1 = src_ptr + src_stride;
2156  asm volatile (
2157    "cmp        %w4, #0                        \n"
2158    "b.eq       100f                           \n"
2159    "cmp        %w4, #128                      \n"
2160    "b.eq       50f                            \n"
2161
2162    "dup        v5.16b, %w4                    \n"
2163    "dup        v4.16b, %w5                    \n"
2164    // General purpose row blend.
2165  "1:                                          \n"
2166    MEMACCESS(1)
2167    "ld1        {v0.16b}, [%1], #16            \n"
2168    MEMACCESS(2)
2169    "ld1        {v1.16b}, [%2], #16            \n"
2170    "subs       %w3, %w3, #16                  \n"
2171    "umull      v2.8h, v0.8b,  v4.8b           \n"
2172    "umull2     v3.8h, v0.16b, v4.16b          \n"
2173    "umlal      v2.8h, v1.8b,  v5.8b           \n"
2174    "umlal2     v3.8h, v1.16b, v5.16b          \n"
2175    "rshrn      v0.8b,  v2.8h, #8              \n"
2176    "rshrn2     v0.16b, v3.8h, #8              \n"
2177    MEMACCESS(0)
2178    "st1        {v0.16b}, [%0], #16            \n"
2179    "b.gt       1b                             \n"
2180    "b          99f                            \n"
2181
2182    // Blend 50 / 50.
2183  "50:                                         \n"
2184    MEMACCESS(1)
2185    "ld1        {v0.16b}, [%1], #16            \n"
2186    MEMACCESS(2)
2187    "ld1        {v1.16b}, [%2], #16            \n"
2188    "subs       %w3, %w3, #16                  \n"
2189    "urhadd     v0.16b, v0.16b, v1.16b         \n"
2190    MEMACCESS(0)
2191    "st1        {v0.16b}, [%0], #16            \n"
2192    "b.gt       50b                            \n"
2193    "b          99f                            \n"
2194
2195    // Blend 100 / 0 - Copy row unchanged.
2196  "100:                                        \n"
2197    MEMACCESS(1)
2198    "ld1        {v0.16b}, [%1], #16            \n"
2199    "subs       %w3, %w3, #16                  \n"
2200    MEMACCESS(0)
2201    "st1        {v0.16b}, [%0], #16            \n"
2202    "b.gt       100b                           \n"
2203
2204  "99:                                         \n"
2205  : "+r"(dst_ptr),          // %0
2206    "+r"(src_ptr),          // %1
2207    "+r"(src_ptr1),         // %2
2208    "+r"(dst_width),        // %3
2209    "+r"(y1_fraction),      // %4
2210    "+r"(y0_fraction)       // %5
2211  :
2212  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2213  );
2214}
2215
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
// Blends src_argb0 over src_argb1 using the alpha of src_argb0 (v3) and
// writes the result to dst_argb with alpha forced to 255. Per colour channel:
//   dst = sat8(s + sat8(d - sat8((d * sa + 128) >> 8)))
// where s comes from src_argb0 and d from src_argb1.
// Handles any width >= 0: whole groups of 8 pixels go through the "8:" loop,
// the remaining 0..7 pixels go through the single-pixel "1:" loop.
void ARGBBlendRow_NEON(const uint8* src_argb0,
                       const uint8* src_argb1,
                       uint8* dst_argb,
                       int width) {
  asm volatile (
    "subs       %w3, %w3, #8                   \n"
    "b.lt       89f                            \n"  // fewer than 8 pixels.
    // Blend 8 pixels.
  "8:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
    MEMACCESS(1)
    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
    "movi       v3.8b, #255                    \n"  // a = 255
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.ge       8b                             \n"

  "89:                                         \n"
    // width is (remaining - 8) here; adding 7 makes it (remaining - 1), which
    // is negative only when no pixels are left.
    "adds       %w3, %w3, #8-1                 \n"
    "b.lt       99f                            \n"

    // Blend 1 pixels.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
    MEMACCESS(1)
    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
    "movi       v3.8b, #255                    \n"  // a = 255
    MEMACCESS(2)
    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
    "b.ge       1b                             \n"

  "99:                                         \n"

  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18"
  );
}
2287
// Attenuate 8 pixels at a time.
// Multiplies the B, G and R bytes of each pixel by that pixel's alpha byte:
//   c = sat8((c * a + 128) >> 8)
// The alpha byte (v3) is stored back unchanged.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
    MEMACCESS(1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
2312
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
// Works in place on dst_argb: the load does not advance the pointer, the
// store does. Only B, G and R are quantized; alpha (v3) is written back as
// loaded. scale, interval_size and interval_offset are broadcast as 16-bit
// lanes (dup .8h), so only their low 16 bits are used.
// The ">> 16" is done with sqdmulh, which doubles the product before taking
// the high half; scale is halved up front to compensate.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void ARGBQuantizeRow_NEON(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile (
    "dup        v4.8h, %w2                     \n"
    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
    "dup        v5.8h, %w3                     \n"  // interval multiply.
    "dup        v6.8h, %w4                     \n"  // interval add

    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
    "uxtl       v1.8h, v1.8b                   \n"
    "uxtl       v2.8h, v2.8b                   \n"
    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
    "add        v1.8h, v1.8h, v6.8h            \n"  // g
    "add        v2.8h, v2.8h, v6.8h            \n"  // r
    "uqxtn      v0.8b, v0.8h                   \n"  // saturate to 8 bit.
    "uqxtn      v1.8b, v1.8h                   \n"
    "uqxtn      v2.8b, v2.8h                   \n"
    MEMACCESS(0)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
2357
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
// NOTE(review): the two notes above use 32-bit NEON mnemonics; the code below
// uses the AArch64 equivalent, sqrdmulh by element.
// Each of the 4 bytes of value scales one channel. zip1 duplicates every byte
// into both halves of a 16-bit lane, ushr halves that lane, and sqrdmulh (a
// rounding, doubling multiply that keeps the high half) applies it to the
// widened channel; the result is saturated back to 8 bits.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void ARGBShadeRow_NEON(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
                       uint32 value) {
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.

    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
    "uxtl       v5.8h, v5.8b                   \n"
    "uxtl       v6.8h, v6.8b                   \n"
    "uxtl       v7.8h, v7.8b                   \n"
    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
    "uqxtn      v4.8b, v4.8h                   \n"
    "uqxtn      v5.8b, v5.8h                   \n"
    "uqxtn      v6.8b, v6.8h                   \n"
    "uqxtn      v7.8b, v7.8h                   \n"
    MEMACCESS(1)
    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "r"(value)            // %3
  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
  );
}
2397
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
// The gray value is written to the B, G and R bytes of every output pixel;
// the alpha byte (v3) is copied from the source unchanged.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
    MEMACCESS(1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
  );
}
2426
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Works in place on dst_argb: the load does not advance the pointer, the
// store does. Each result is saturated to 8 bits (uqshrn, no rounding) and the
// alpha byte (v3) is written back as loaded.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.

void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "movi       v20.8b, #17                    \n"  // BB coefficient
    "movi       v21.8b, #68                    \n"  // BG coefficient
    "movi       v22.8b, #35                    \n"  // BR coefficient
    "movi       v24.8b, #22                    \n"  // GB coefficient
    "movi       v25.8b, #88                    \n"  // GG coefficient
    "movi       v26.8b, #45                    \n"  // GR coefficient
    "movi       v28.8b, #24                    \n"  // RB coefficient
    "movi       v29.8b, #98                    \n"  // RG coefficient
    "movi       v30.8b, #50                    \n"  // RR coefficient
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
    MEMACCESS(0)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
    "b.gt       1b                             \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
  );
}
2469
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
// matrix_argb supplies 16 signed 8-bit coefficients, read with one 16-byte
// load: entries 0-3 produce B, 4-7 produce G, 8-11 produce R and 12-15
// produce A, each group weighting the source B, G, R and A in that order.
// Products are accumulated with saturating 16-bit adds, then shifted right by
// 6 and saturated to unsigned 8 bits.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void ARGBColorMatrixRow_NEON(const uint8* src_argb,
                             uint8* dst_argb,
                             const int8* matrix_argb,
                             int width) {
  asm volatile (
    MEMACCESS(3)
    "ld1        {v2.16b}, [%3]                 \n"  // load 16 coefficients.
    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.

  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
    "uxtl       v17.8h, v17.8b                 \n"  // g
    "uxtl       v18.8h, v18.8b                 \n"  // r
    "uxtl       v19.8h, v19.8b                 \n"  // a
    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
    MEMACCESS(1)
    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
    "b.gt       1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "r"(matrix_argb)  // %3
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v22", "v23", "v24", "v25"
  );
}
2534
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Every byte, alpha included, becomes (a * b + 128) >> 8 of the two
// corresponding source bytes.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void ARGBMultiplyRow_NEON(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
2569
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Every byte, alpha included, is the unsigned saturating sum (clamped at 255)
// of the two corresponding source bytes.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void ARGBAddRow_NEON(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // B
    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // G
    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // R
    "uqadd      v3.8b, v3.8b, v7.8b            \n"  // A
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
2599
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Every byte, alpha included, is src_argb0 - src_argb1 with unsigned
// saturation (clamped at 0).
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void ARGBSubtractRow_NEON(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uqsub      v0.8b, v0.8b, v4.8b            \n"  // B
    "uqsub      v1.8b, v1.8b, v5.8b            \n"  // G
    "uqsub      v2.8b, v2.8b, v6.8b            \n"  // R
    "uqsub      v3.8b, v3.8b, v7.8b            \n"  // A
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
2629
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Sobel is the unsigned saturating sum of the two input bytes. Each
// iteration reads 8 bytes from each input and writes 8 ARGB pixels (32 bytes).
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void SobelRow_NEON(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
                   int width) {
  asm volatile (
    "movi       v3.8b, #255                    \n"  // alpha
    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
    "orr        v1.8b, v0.8b, v0.8b            \n"  // copy sobel to G.
    "orr        v2.8b, v0.8b, v0.8b            \n"  // copy sobel to R.
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
2662
// Adds Sobel X and Sobel Y and stores Sobel into plane.
// Each output byte is the unsigned saturating sum of the two input bytes.
// width is tested after each group of 16, so at least 16 bytes are always
// written and width is effectively rounded up to a multiple of 16.
void SobelToPlaneRow_NEON(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_y,
                          int width) {
  asm volatile (
    // 16 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
    MEMACCESS(1)
    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
    MEMACCESS(2)
    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
    "b.gt       1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1"
  );
}
2688
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Sobel is the unsigned saturating sum of the two input bytes. The inputs are
// loaded straight into the registers st4 stores as R (v2) and B (v0), so no
// copies are needed.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void SobelXYRow_NEON(const uint8* src_sobelx,
                     const uint8* src_sobely,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    "movi       v3.8b, #255                    \n"  // alpha
    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
2719
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// For each of the three source rows the code loads 8 bytes at offset 0 and 8
// bytes at offset 2 and takes their difference; the middle row's difference is
// added twice. The output is the absolute value of the sum, saturated to 8
// bits, so the sign convention of the matrix does not affect the result.
// Each pointer is post-incremented by 2 and then by 6 (operands %5 and %6),
// i.e. by 8 per iteration, and the second load reads 2 bytes beyond the
// current 8-pixel group.
// width is tested after each group of 8, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
void SobelXRow_NEON(const uint8* src_y0,
                    const uint8* src_y1,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.8b}, [%0],%5               \n"  // top
    MEMACCESS(0)
    "ld1        {v1.8b}, [%0],%6               \n"
    "usubl      v0.8h, v0.8b, v1.8b            \n"
    MEMACCESS(1)
    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
    MEMACCESS(1)
    "ld1        {v3.8b}, [%1],%6               \n"
    "usubl      v1.8h, v2.8b, v3.8b            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    MEMACCESS(2)
    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
    MEMACCESS(2)
    "ld1        {v3.8b}, [%2],%6               \n"
    "subs       %w4, %w4, #8                   \n"  // 8 pixels
    "usubl      v1.8h, v2.8b, v3.8b            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    "abs        v0.8h, v0.8h                   \n"
    "uqxtn      v0.8b, v0.8h                   \n"
    MEMACCESS(3)
    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
    "b.gt       1b                             \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  : "r"(2LL),          // %5
    "r"(6LL)           // %6
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
2765
2766// SobelY as a matrix is
2767// -1 -2 -1
2768//  0  0  0
2769//  1  2  1
2770void SobelYRow_NEON(const uint8* src_y0,
2771                    const uint8* src_y1,
2772                    uint8* dst_sobely,
2773                    int width) {
2774  asm volatile (
2775  "1:                                          \n"
2776    MEMACCESS(0)
2777    "ld1        {v0.8b}, [%0],%4               \n"  // left
2778    MEMACCESS(1)
2779    "ld1        {v1.8b}, [%1],%4               \n"
2780    "usubl      v0.8h, v0.8b, v1.8b            \n"
2781    MEMACCESS(0)
2782    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
2783    MEMACCESS(1)
2784    "ld1        {v3.8b}, [%1],%4               \n"
2785    "usubl      v1.8h, v2.8b, v3.8b            \n"
2786    "add        v0.8h, v0.8h, v1.8h            \n"
2787    "add        v0.8h, v0.8h, v1.8h            \n"
2788    MEMACCESS(0)
2789    "ld1        {v2.8b}, [%0],%5               \n"  // right
2790    MEMACCESS(1)
2791    "ld1        {v3.8b}, [%1],%5               \n"
2792    "subs       %w3, %w3, #8                   \n"  // 8 pixels
2793    "usubl      v1.8h, v2.8b, v3.8b            \n"
2794    "add        v0.8h, v0.8h, v1.8h            \n"
2795    "abs        v0.8h, v0.8h                   \n"
2796    "uqxtn      v0.8b, v0.8h                   \n"
2797    MEMACCESS(2)
2798    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
2799    "b.gt       1b                             \n"
2800  : "+r"(src_y0),      // %0
2801    "+r"(src_y1),      // %1
2802    "+r"(dst_sobely),  // %2
2803    "+r"(width)        // %3
2804  : "r"(1LL),          // %4
2805    "r"(6LL)           // %5
2806  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2807  );
2808}
2809
2810// Caveat - rounds float to half float whereas scaling version truncates.
2811void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
2812  asm volatile (
2813  "1:                                          \n"
2814    MEMACCESS(0)
2815    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
2816    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
2817    "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
2818    "uxtl2      v3.4s, v1.8h                   \n"
2819    "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
2820    "scvtf      v3.4s, v3.4s                   \n"
2821    "fcvtn      v1.4h, v2.4s                   \n"  // 8 half floats
2822    "fcvtn2     v1.8h, v3.4s                   \n"
2823   MEMACCESS(1)
2824    "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
2825    "b.gt       1b                             \n"
2826  : "+r"(src),    // %0
2827    "+r"(dst),    // %1
2828    "+r"(width)   // %2
2829  :
2830  : "cc", "memory", "v1", "v2", "v3"
2831  );
2832}
2833
2834void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
2835  asm volatile (
2836  "1:                                          \n"
2837    MEMACCESS(0)
2838    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
2839    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
2840    "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
2841    "uxtl2      v3.4s, v1.8h                   \n"
2842    "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
2843    "scvtf      v3.4s, v3.4s                   \n"
2844    "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
2845    "fmul       v3.4s, v3.4s, %3.s[0]          \n"
2846    "uqshrn     v1.4h, v2.4s, #13              \n"  // isolate halffloat
2847    "uqshrn2    v1.8h, v3.4s, #13              \n"
2848   MEMACCESS(1)
2849    "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
2850    "b.gt       1b                             \n"
2851  : "+r"(src),    // %0
2852    "+r"(dst),    // %1
2853    "+r"(width)   // %2
2854  : "w"(scale * 1.9259299444e-34f)    // %3
2855  : "cc", "memory", "v1", "v2", "v3"
2856  );
2857}
2858
2859#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
2860
2861#ifdef __cplusplus
2862}  // extern "C"
2863}  // namespace libyuv
2864#endif
2865