1/*
2 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
18// This module is for GCC Neon armv8 64 bit.
19#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
21// Read 8 Y, 4 U and 4 V from 422
22#define READYUV422                                                             \
23    MEMACCESS(0)                                                               \
24    "ld1        {v0.8b}, [%0], #8              \n"                             \
25    MEMACCESS(1)                                                               \
26    "ld1        {v1.s}[0], [%1], #4            \n"                             \
27    MEMACCESS(2)                                                               \
28    "ld1        {v1.s}[1], [%2], #4            \n"
29
30// Read 8 Y, 2 U and 2 V from 422
31#define READYUV411                                                             \
32    MEMACCESS(0)                                                               \
33    "ld1        {v0.8b}, [%0], #8              \n"                             \
34    MEMACCESS(1)                                                               \
35    "ld1        {v2.h}[0], [%1], #2            \n"                             \
36    MEMACCESS(2)                                                               \
37    "ld1        {v2.h}[1], [%2], #2            \n"                             \
38    "zip1       v1.8b, v2.8b, v2.8b            \n"
39
40// Read 8 Y, 8 U and 8 V from 444
41#define READYUV444                                                             \
42    MEMACCESS(0)                                                               \
43    "ld1        {v0.8b}, [%0], #8              \n"                             \
44    MEMACCESS(1)                                                               \
45    "ld1        {v1.d}[0], [%1], #8            \n"                             \
46    MEMACCESS(2)                                                               \
47    "ld1        {v1.d}[1], [%2], #8            \n"                             \
48    "uaddlp     v1.8h, v1.16b                  \n"                             \
49    "rshrn      v1.8b, v1.8h, #1               \n"
50
51// Read 8 Y, and set 4 U and 4 V to 128
52#define READYUV400                                                             \
53    MEMACCESS(0)                                                               \
54    "ld1        {v0.8b}, [%0], #8              \n"                             \
55    "movi       v1.8b , #128                   \n"
56
57// Read 8 Y and 4 UV from NV12
58#define READNV12                                                               \
59    MEMACCESS(0)                                                               \
60    "ld1        {v0.8b}, [%0], #8              \n"                             \
61    MEMACCESS(1)                                                               \
62    "ld1        {v2.8b}, [%1], #8              \n"                             \
63    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
64    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
65    "ins        v1.s[1], v3.s[0]               \n"
66
67// Read 8 Y and 4 VU from NV21
68#define READNV21                                                               \
69    MEMACCESS(0)                                                               \
70    "ld1        {v0.8b}, [%0], #8              \n"                             \
71    MEMACCESS(1)                                                               \
72    "ld1        {v2.8b}, [%1], #8              \n"                             \
73    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
74    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
75    "ins        v1.s[1], v3.s[0]               \n"
76
77// Read 8 YUY2
78#define READYUY2                                                               \
79    MEMACCESS(0)                                                               \
80    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
81    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
82    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
83    "ins        v1.s[1], v3.s[0]               \n"
84
85// Read 8 UYVY
86#define READUYVY                                                               \
87    MEMACCESS(0)                                                               \
88    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
89    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
90    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
91    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
92    "ins        v1.s[1], v3.s[0]               \n"
93
94#define YUVTORGB_SETUP                                                         \
95    "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
96    "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
97    "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
98    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
99    "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                             \
100    "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
101
102#define YUVTORGB(vR, vG, vB)                                                   \
103    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
104    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
105    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
106    "ushll      v0.4s, v0.4h, #0               \n"                             \
107    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
108    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
109    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
110    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
111    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
112    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
113    "uxtl       v2.8h, v2.8b                   \n"                             \
114    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
115    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
116    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
117    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
118    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
119    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
120    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
121    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
122    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
123    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
124    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
125    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
126    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
127    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
128    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
129
130void I444ToARGBRow_NEON(const uint8* src_y,
131                        const uint8* src_u,
132                        const uint8* src_v,
133                        uint8* dst_argb,
134                        const struct YuvConstants* yuvconstants,
135                        int width) {
136  asm volatile (
137    YUVTORGB_SETUP
138    "movi       v23.8b, #255                   \n" /* A */
139  "1:                                          \n"
140    READYUV444
141    YUVTORGB(v22, v21, v20)
142    "subs       %w4, %w4, #8                   \n"
143    MEMACCESS(3)
144    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
145    "b.gt       1b                             \n"
146    : "+r"(src_y),     // %0
147      "+r"(src_u),     // %1
148      "+r"(src_v),     // %2
149      "+r"(dst_argb),  // %3
150      "+r"(width)      // %4
151    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
152      [kUVToG]"r"(&yuvconstants->kUVToG),
153      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
154      [kYToRgb]"r"(&yuvconstants->kYToRgb)
155    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
156      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
157  );
158}
159
160void I422ToARGBRow_NEON(const uint8* src_y,
161                        const uint8* src_u,
162                        const uint8* src_v,
163                        uint8* dst_argb,
164                        const struct YuvConstants* yuvconstants,
165                        int width) {
166  asm volatile (
167    YUVTORGB_SETUP
168    "movi       v23.8b, #255                   \n" /* A */
169  "1:                                          \n"
170    READYUV422
171    YUVTORGB(v22, v21, v20)
172    "subs       %w4, %w4, #8                   \n"
173    MEMACCESS(3)
174    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
175    "b.gt       1b                             \n"
176    : "+r"(src_y),     // %0
177      "+r"(src_u),     // %1
178      "+r"(src_v),     // %2
179      "+r"(dst_argb),  // %3
180      "+r"(width)      // %4
181    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
182      [kUVToG]"r"(&yuvconstants->kUVToG),
183      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
184      [kYToRgb]"r"(&yuvconstants->kYToRgb)
185    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
186      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
187  );
188}
189
190void I422AlphaToARGBRow_NEON(const uint8* src_y,
191                             const uint8* src_u,
192                             const uint8* src_v,
193                             const uint8* src_a,
194                             uint8* dst_argb,
195                             const struct YuvConstants* yuvconstants,
196                             int width) {
197  asm volatile (
198    YUVTORGB_SETUP
199  "1:                                          \n"
200    READYUV422
201    YUVTORGB(v22, v21, v20)
202    MEMACCESS(3)
203    "ld1        {v23.8b}, [%3], #8             \n"
204    "subs       %w5, %w5, #8                   \n"
205    MEMACCESS(4)
206    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
207    "b.gt       1b                             \n"
208    : "+r"(src_y),     // %0
209      "+r"(src_u),     // %1
210      "+r"(src_v),     // %2
211      "+r"(src_a),     // %3
212      "+r"(dst_argb),  // %4
213      "+r"(width)      // %5
214    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
215      [kUVToG]"r"(&yuvconstants->kUVToG),
216      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
217      [kYToRgb]"r"(&yuvconstants->kYToRgb)
218    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
219      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
220  );
221}
222
223void I411ToARGBRow_NEON(const uint8* src_y,
224                        const uint8* src_u,
225                        const uint8* src_v,
226                        uint8* dst_argb,
227                        const struct YuvConstants* yuvconstants,
228                        int width) {
229  asm volatile (
230    YUVTORGB_SETUP
231    "movi       v23.8b, #255                   \n" /* A */
232  "1:                                          \n"
233    READYUV411
234    YUVTORGB(v22, v21, v20)
235    "subs       %w4, %w4, #8                   \n"
236    MEMACCESS(3)
237    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
238    "b.gt       1b                             \n"
239    : "+r"(src_y),     // %0
240      "+r"(src_u),     // %1
241      "+r"(src_v),     // %2
242      "+r"(dst_argb),  // %3
243      "+r"(width)      // %4
244    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
245      [kUVToG]"r"(&yuvconstants->kUVToG),
246      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
247      [kYToRgb]"r"(&yuvconstants->kYToRgb)
248    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
249      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
250  );
251}
252
253void I422ToRGBARow_NEON(const uint8* src_y,
254                        const uint8* src_u,
255                        const uint8* src_v,
256                        uint8* dst_rgba,
257                        const struct YuvConstants* yuvconstants,
258                        int width) {
259  asm volatile (
260    YUVTORGB_SETUP
261    "movi       v20.8b, #255                   \n" /* A */
262  "1:                                          \n"
263    READYUV422
264    YUVTORGB(v23, v22, v21)
265    "subs       %w4, %w4, #8                   \n"
266    MEMACCESS(3)
267    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
268    "b.gt       1b                             \n"
269    : "+r"(src_y),     // %0
270      "+r"(src_u),     // %1
271      "+r"(src_v),     // %2
272      "+r"(dst_rgba),  // %3
273      "+r"(width)      // %4
274    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
275      [kUVToG]"r"(&yuvconstants->kUVToG),
276      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
277      [kYToRgb]"r"(&yuvconstants->kYToRgb)
278    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
279      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
280  );
281}
282
283void I422ToRGB24Row_NEON(const uint8* src_y,
284                         const uint8* src_u,
285                         const uint8* src_v,
286                         uint8* dst_rgb24,
287                         const struct YuvConstants* yuvconstants,
288                         int width) {
289  asm volatile (
290    YUVTORGB_SETUP
291  "1:                                          \n"
292    READYUV422
293    YUVTORGB(v22, v21, v20)
294    "subs       %w4, %w4, #8                   \n"
295    MEMACCESS(3)
296    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
297    "b.gt       1b                             \n"
298    : "+r"(src_y),     // %0
299      "+r"(src_u),     // %1
300      "+r"(src_v),     // %2
301      "+r"(dst_rgb24), // %3
302      "+r"(width)      // %4
303    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
304      [kUVToG]"r"(&yuvconstants->kUVToG),
305      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
306      [kYToRgb]"r"(&yuvconstants->kYToRgb)
307    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
308      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
309  );
310}
311
312#define ARGBTORGB565                                                           \
313    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
314    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
315    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
316    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
317    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
318
319void I422ToRGB565Row_NEON(const uint8* src_y,
320                          const uint8* src_u,
321                          const uint8* src_v,
322                          uint8* dst_rgb565,
323                          const struct YuvConstants* yuvconstants,
324                          int width) {
325  asm volatile (
326    YUVTORGB_SETUP
327  "1:                                          \n"
328    READYUV422
329    YUVTORGB(v22, v21, v20)
330    "subs       %w4, %w4, #8                   \n"
331    ARGBTORGB565
332    MEMACCESS(3)
333    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
334    "b.gt       1b                             \n"
335    : "+r"(src_y),    // %0
336      "+r"(src_u),    // %1
337      "+r"(src_v),    // %2
338      "+r"(dst_rgb565),  // %3
339      "+r"(width)     // %4
340    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
341      [kUVToG]"r"(&yuvconstants->kUVToG),
342      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
343      [kYToRgb]"r"(&yuvconstants->kYToRgb)
344    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
345      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
346  );
347}
348
349#define ARGBTOARGB1555                                                         \
350    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
351    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
352    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
353    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
354    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
355    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
356    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
357
358void I422ToARGB1555Row_NEON(const uint8* src_y,
359                            const uint8* src_u,
360                            const uint8* src_v,
361                            uint8* dst_argb1555,
362                            const struct YuvConstants* yuvconstants,
363                            int width) {
364  asm volatile (
365    YUVTORGB_SETUP
366    "movi       v23.8b, #255                   \n"
367  "1:                                          \n"
368    READYUV422
369    YUVTORGB(v22, v21, v20)
370    "subs       %w4, %w4, #8                   \n"
371    ARGBTOARGB1555
372    MEMACCESS(3)
373    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
374    "b.gt       1b                             \n"
375    : "+r"(src_y),    // %0
376      "+r"(src_u),    // %1
377      "+r"(src_v),    // %2
378      "+r"(dst_argb1555),  // %3
379      "+r"(width)     // %4
380    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
381      [kUVToG]"r"(&yuvconstants->kUVToG),
382      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
383      [kYToRgb]"r"(&yuvconstants->kYToRgb)
384    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
385      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
386  );
387}
388
389#define ARGBTOARGB4444                                                         \
390    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
391    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
392    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
393    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
394    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
395    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
396    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
397    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
398
399void I422ToARGB4444Row_NEON(const uint8* src_y,
400                            const uint8* src_u,
401                            const uint8* src_v,
402                            uint8* dst_argb4444,
403                            const struct YuvConstants* yuvconstants,
404                            int width) {
405  asm volatile (
406    YUVTORGB_SETUP
407    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
408  "1:                                          \n"
409    READYUV422
410    YUVTORGB(v22, v21, v20)
411    "subs       %w4, %w4, #8                   \n"
412    "movi       v23.8b, #255                   \n"
413    ARGBTOARGB4444
414    MEMACCESS(3)
415    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
416    "b.gt       1b                             \n"
417    : "+r"(src_y),    // %0
418      "+r"(src_u),    // %1
419      "+r"(src_v),    // %2
420      "+r"(dst_argb4444),  // %3
421      "+r"(width)     // %4
422    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
423      [kUVToG]"r"(&yuvconstants->kUVToG),
424      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
425      [kYToRgb]"r"(&yuvconstants->kYToRgb)
426    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
427      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
428  );
429}
430
431void I400ToARGBRow_NEON(const uint8* src_y,
432                        uint8* dst_argb,
433                        int width) {
434  asm volatile (
435    YUVTORGB_SETUP
436    "movi       v23.8b, #255                   \n"
437  "1:                                          \n"
438    READYUV400
439    YUVTORGB(v22, v21, v20)
440    "subs       %w2, %w2, #8                   \n"
441    MEMACCESS(1)
442    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
443    "b.gt       1b                             \n"
444    : "+r"(src_y),     // %0
445      "+r"(dst_argb),  // %1
446      "+r"(width)      // %2
447    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
448      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
449      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
450      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
451    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
452      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
453  );
454}
455
456void J400ToARGBRow_NEON(const uint8* src_y,
457                        uint8* dst_argb,
458                        int width) {
459  asm volatile (
460    "movi       v23.8b, #255                   \n"
461  "1:                                          \n"
462    MEMACCESS(0)
463    "ld1        {v20.8b}, [%0], #8             \n"
464    "orr        v21.8b, v20.8b, v20.8b         \n"
465    "orr        v22.8b, v20.8b, v20.8b         \n"
466    "subs       %w2, %w2, #8                   \n"
467    MEMACCESS(1)
468    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
469    "b.gt       1b                             \n"
470    : "+r"(src_y),     // %0
471      "+r"(dst_argb),  // %1
472      "+r"(width)      // %2
473    :
474    : "cc", "memory", "v20", "v21", "v22", "v23"
475  );
476}
477
478void NV12ToARGBRow_NEON(const uint8* src_y,
479                        const uint8* src_uv,
480                        uint8* dst_argb,
481                        const struct YuvConstants* yuvconstants,
482                        int width) {
483  asm volatile (
484    YUVTORGB_SETUP
485    "movi       v23.8b, #255                   \n"
486  "1:                                          \n"
487    READNV12
488    YUVTORGB(v22, v21, v20)
489    "subs       %w3, %w3, #8                   \n"
490    MEMACCESS(2)
491    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
492    "b.gt       1b                             \n"
493    : "+r"(src_y),     // %0
494      "+r"(src_uv),    // %1
495      "+r"(dst_argb),  // %2
496      "+r"(width)      // %3
497    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
498      [kUVToG]"r"(&yuvconstants->kUVToG),
499      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
500      [kYToRgb]"r"(&yuvconstants->kYToRgb)
501    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
502      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
503  );
504}
505
506void NV21ToARGBRow_NEON(const uint8* src_y,
507                        const uint8* src_vu,
508                        uint8* dst_argb,
509                        const struct YuvConstants* yuvconstants,
510                        int width) {
511  asm volatile (
512    YUVTORGB_SETUP
513    "movi       v23.8b, #255                   \n"
514  "1:                                          \n"
515    READNV21
516    YUVTORGB(v22, v21, v20)
517    "subs       %w3, %w3, #8                   \n"
518    MEMACCESS(2)
519    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
520    "b.gt       1b                             \n"
521    : "+r"(src_y),     // %0
522      "+r"(src_vu),    // %1
523      "+r"(dst_argb),  // %2
524      "+r"(width)      // %3
525    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
526      [kUVToG]"r"(&yuvconstants->kUVToG),
527      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
528      [kYToRgb]"r"(&yuvconstants->kYToRgb)
529    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
530      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
531  );
532}
533
534void NV12ToRGB565Row_NEON(const uint8* src_y,
535                          const uint8* src_uv,
536                          uint8* dst_rgb565,
537                          const struct YuvConstants* yuvconstants,
538                          int width) {
539  asm volatile (
540    YUVTORGB_SETUP
541  "1:                                          \n"
542    READNV12
543    YUVTORGB(v22, v21, v20)
544    "subs       %w3, %w3, #8                   \n"
545    ARGBTORGB565
546    MEMACCESS(2)
547    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
548    "b.gt       1b                             \n"
549    : "+r"(src_y),     // %0
550      "+r"(src_uv),    // %1
551      "+r"(dst_rgb565),  // %2
552      "+r"(width)      // %3
553    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
554      [kUVToG]"r"(&yuvconstants->kUVToG),
555      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
556      [kYToRgb]"r"(&yuvconstants->kYToRgb)
557    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
558      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
559  );
560}
561
562void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
563                        uint8* dst_argb,
564                        const struct YuvConstants* yuvconstants,
565                        int width) {
566  asm volatile (
567    YUVTORGB_SETUP
568    "movi       v23.8b, #255                   \n"
569  "1:                                          \n"
570    READYUY2
571    YUVTORGB(v22, v21, v20)
572    "subs       %w2, %w2, #8                   \n"
573    MEMACCESS(1)
574    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
575    "b.gt       1b                             \n"
576    : "+r"(src_yuy2),  // %0
577      "+r"(dst_argb),  // %1
578      "+r"(width)      // %2
579    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
580      [kUVToG]"r"(&yuvconstants->kUVToG),
581      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
582      [kYToRgb]"r"(&yuvconstants->kYToRgb)
583    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
584      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
585  );
586}
587
588void UYVYToARGBRow_NEON(const uint8* src_uyvy,
589                        uint8* dst_argb,
590                        const struct YuvConstants* yuvconstants,
591                        int width) {
592  asm volatile (
593    YUVTORGB_SETUP
594    "movi       v23.8b, #255                   \n"
595  "1:                                          \n"
596    READUYVY
597    YUVTORGB(v22, v21, v20)
598    "subs       %w2, %w2, #8                   \n"
599    MEMACCESS(1)
600    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
601    "b.gt       1b                             \n"
602    : "+r"(src_uyvy),  // %0
603      "+r"(dst_argb),  // %1
604      "+r"(width)      // %2
605    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
606      [kUVToG]"r"(&yuvconstants->kUVToG),
607      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
608      [kYToRgb]"r"(&yuvconstants->kYToRgb)
609    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
610      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
611  );
612}
613
614// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
615void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
616                     int width) {
617  asm volatile (
618  "1:                                          \n"
619    MEMACCESS(0)
620    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
621    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
622    MEMACCESS(1)
623    "st1        {v0.16b}, [%1], #16            \n"  // store U
624    MEMACCESS(2)
625    "st1        {v1.16b}, [%2], #16            \n"  // store V
626    "b.gt       1b                             \n"
627    : "+r"(src_uv),  // %0
628      "+r"(dst_u),   // %1
629      "+r"(dst_v),   // %2
630      "+r"(width)    // %3  // Output registers
631    :                       // Input registers
632    : "cc", "memory", "v0", "v1"  // Clobber List
633  );
634}
635
636// Reads 16 U's and V's and writes out 16 pairs of UV.
637void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
638                     int width) {
639  asm volatile (
640  "1:                                          \n"
641    MEMACCESS(0)
642    "ld1        {v0.16b}, [%0], #16            \n"  // load U
643    MEMACCESS(1)
644    "ld1        {v1.16b}, [%1], #16            \n"  // load V
645    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
646    MEMACCESS(2)
647    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
648    "b.gt       1b                             \n"
649    :
650      "+r"(src_u),   // %0
651      "+r"(src_v),   // %1
652      "+r"(dst_uv),  // %2
653      "+r"(width)    // %3  // Output registers
654    :                       // Input registers
655    : "cc", "memory", "v0", "v1"  // Clobber List
656  );
657}
658
659// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
660void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
661  asm volatile (
662  "1:                                          \n"
663    MEMACCESS(0)
664    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
665    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
666    MEMACCESS(1)
667    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
668    "b.gt       1b                             \n"
669  : "+r"(src),   // %0
670    "+r"(dst),   // %1
671    "+r"(count)  // %2  // Output registers
672  :                     // Input registers
673  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
674  );
675}
676
677// SetRow writes 'count' bytes using an 8 bit value repeated.
678void SetRow_NEON(uint8* dst, uint8 v8, int count) {
679  asm volatile (
680    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
681  "1:                                          \n"
682    "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
683    MEMACCESS(0)
684    "st1        {v0.16b}, [%0], #16            \n"  // store
685    "b.gt       1b                             \n"
686  : "+r"(dst),   // %0
687    "+r"(count)  // %1
688  : "r"(v8)      // %2
689  : "cc", "memory", "v0"
690  );
691}
692
693void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
694  asm volatile (
695    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
696  "1:                                          \n"
697    "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
698    MEMACCESS(0)
699    "st1        {v0.16b}, [%0], #16            \n"  // store
700    "b.gt       1b                             \n"
701  : "+r"(dst),   // %0
702    "+r"(count)  // %1
703  : "r"(v32)     // %2
704  : "cc", "memory", "v0"
705  );
706}
707
708void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
709  asm volatile (
710    // Start at end of source row.
711    "add        %0, %0, %w2, sxtw              \n"
712    "sub        %0, %0, #16                    \n"
713  "1:                                          \n"
714    MEMACCESS(0)
715    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
716    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
717    "rev64      v0.16b, v0.16b                 \n"
718    MEMACCESS(1)
719    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
720    MEMACCESS(1)
721    "st1        {v0.D}[0], [%1], #8            \n"
722    "b.gt       1b                             \n"
723  : "+r"(src),   // %0
724    "+r"(dst),   // %1
725    "+r"(width)  // %2
726  : "r"((ptrdiff_t)-16)    // %3
727  : "cc", "memory", "v0"
728  );
729}
730
731void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
732                      int width) {
733  asm volatile (
734    // Start at end of source row.
735    "add        %0, %0, %w3, sxtw #1           \n"
736    "sub        %0, %0, #16                    \n"
737  "1:                                          \n"
738    MEMACCESS(0)
739    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
740    "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
741    "rev64      v0.8b, v0.8b                   \n"
742    "rev64      v1.8b, v1.8b                   \n"
743    MEMACCESS(1)
744    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
745    MEMACCESS(2)
746    "st1        {v1.8b}, [%2], #8              \n"
747    "b.gt       1b                             \n"
748  : "+r"(src_uv),  // %0
749    "+r"(dst_u),   // %1
750    "+r"(dst_v),   // %2
751    "+r"(width)    // %3
752  : "r"((ptrdiff_t)-16)      // %4
753  : "cc", "memory", "v0", "v1"
754  );
755}
756
757void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
758  asm volatile (
759  // Start at end of source row.
760    "add        %0, %0, %w2, sxtw #2           \n"
761    "sub        %0, %0, #16                    \n"
762  "1:                                          \n"
763    MEMACCESS(0)
764    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
765    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
766    "rev64      v0.4s, v0.4s                   \n"
767    MEMACCESS(1)
768    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
769    MEMACCESS(1)
770    "st1        {v0.D}[0], [%1], #8            \n"
771    "b.gt       1b                             \n"
772  : "+r"(src),   // %0
773    "+r"(dst),   // %1
774    "+r"(width)  // %2
775  : "r"((ptrdiff_t)-16)    // %3
776  : "cc", "memory", "v0"
777  );
778}
779
780void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
781  asm volatile (
782    "movi       v4.8b, #255                    \n"  // Alpha
783  "1:                                          \n"
784    MEMACCESS(0)
785    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
786    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
787    MEMACCESS(1)
788    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
789    "b.gt       1b                             \n"
790  : "+r"(src_rgb24),  // %0
791    "+r"(dst_argb),   // %1
792    "+r"(width)       // %2
793  :
794  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
795  );
796}
797
798void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
799  asm volatile (
800    "movi       v5.8b, #255                    \n"  // Alpha
801  "1:                                          \n"
802    MEMACCESS(0)
803    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
804    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
805    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
806    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
807    MEMACCESS(1)
808    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
809    "b.gt       1b                             \n"
810  : "+r"(src_raw),   // %0
811    "+r"(dst_argb),  // %1
812    "+r"(width)      // %2
813  :
814  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
815  );
816}
817
818void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
819  asm volatile (
820  "1:                                          \n"
821    MEMACCESS(0)
822    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
823    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
824    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
825    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
826    MEMACCESS(1)
827    "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
828    "b.gt       1b                             \n"
829  : "+r"(src_raw),    // %0
830    "+r"(dst_rgb24),  // %1
831    "+r"(width)       // %2
832  :
833  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
834  );
835}
836
837#define RGB565TOARGB                                                           \
838    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
839    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
840    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
841    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
842    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
843    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
844    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
845    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
846    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
847    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
848    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
849
850void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
851  asm volatile (
852    "movi       v3.8b, #255                    \n"  // Alpha
853  "1:                                          \n"
854    MEMACCESS(0)
855    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
856    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
857    RGB565TOARGB
858    MEMACCESS(1)
859    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
860    "b.gt       1b                             \n"
861  : "+r"(src_rgb565),  // %0
862    "+r"(dst_argb),    // %1
863    "+r"(width)          // %2
864  :
865  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
866  );
867}
868
869#define ARGB1555TOARGB                                                         \
870    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
871    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
872    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
873                                                                               \
874    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
875    "xtn2       v3.16b, v2.8h                  \n"                             \
876                                                                               \
877    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
878    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
879                                                                               \
880    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
881    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
882    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
883                                                                               \
884    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
885    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
886    "dup        v1.2D, v0.D[1]                 \n"                             \
887    "dup        v3.2D, v2.D[1]                 \n"
888
889// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
890#define RGB555TOARGB                                                           \
891    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
892    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
893    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
894                                                                               \
895    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
896    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
897                                                                               \
898    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
899    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
900    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
901                                                                               \
902    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
903    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
904    "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
905
906void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
907                            int width) {
908  asm volatile (
909    "movi       v3.8b, #255                    \n"  // Alpha
910  "1:                                          \n"
911    MEMACCESS(0)
912    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
913    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
914    ARGB1555TOARGB
915    MEMACCESS(1)
916    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
917    "b.gt       1b                             \n"
918  : "+r"(src_argb1555),  // %0
919    "+r"(dst_argb),    // %1
920    "+r"(width)          // %2
921  :
922  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
923  );
924}
925
926#define ARGB4444TOARGB                                                         \
927    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
928    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
929    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
930    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
931    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
932    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
933    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
934    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
935    "dup        v0.2D, v2.D[1]                 \n"                             \
936    "dup        v1.2D, v3.D[1]                 \n"
937
938void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
939                            int width) {
940  asm volatile (
941  "1:                                          \n"
942    MEMACCESS(0)
943    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
944    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
945    ARGB4444TOARGB
946    MEMACCESS(1)
947    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
948    "b.gt       1b                             \n"
949  : "+r"(src_argb4444),  // %0
950    "+r"(dst_argb),    // %1
951    "+r"(width)          // %2
952  :
953  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
954  );
955}
956
957void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
958  asm volatile (
959  "1:                                          \n"
960    MEMACCESS(0)
961    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
962    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
963    MEMACCESS(1)
964    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
965    "b.gt       1b                             \n"
966  : "+r"(src_argb),   // %0
967    "+r"(dst_rgb24),  // %1
968    "+r"(width)         // %2
969  :
970  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
971  );
972}
973
974void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
975  asm volatile (
976  "1:                                          \n"
977    MEMACCESS(0)
978    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
979    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
980    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
981    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
982    MEMACCESS(1)
983    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
984    "b.gt       1b                             \n"
985  : "+r"(src_argb),  // %0
986    "+r"(dst_raw),   // %1
987    "+r"(width)        // %2
988  :
989  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
990  );
991}
992
993void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
994  asm volatile (
995  "1:                                          \n"
996    MEMACCESS(0)
997    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
998    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
999    MEMACCESS(1)
1000    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1001    "b.gt       1b                             \n"
1002  : "+r"(src_yuy2),  // %0
1003    "+r"(dst_y),     // %1
1004    "+r"(width)        // %2
1005  :
1006  : "cc", "memory", "v0", "v1"  // Clobber List
1007  );
1008}
1009
1010void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
1011  asm volatile (
1012  "1:                                          \n"
1013    MEMACCESS(0)
1014    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
1015    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
1016    MEMACCESS(1)
1017    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1018    "b.gt       1b                             \n"
1019  : "+r"(src_uyvy),  // %0
1020    "+r"(dst_y),     // %1
1021    "+r"(width)        // %2
1022  :
1023  : "cc", "memory", "v0", "v1"  // Clobber List
1024  );
1025}
1026
1027void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1028                         int width) {
1029  asm volatile (
1030  "1:                                          \n"
1031    MEMACCESS(0)
1032    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
1033    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1034    MEMACCESS(1)
1035    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
1036    MEMACCESS(2)
1037    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
1038    "b.gt       1b                             \n"
1039  : "+r"(src_yuy2),  // %0
1040    "+r"(dst_u),     // %1
1041    "+r"(dst_v),     // %2
1042    "+r"(width)        // %3
1043  :
1044  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1045  );
1046}
1047
1048void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1049                         int width) {
1050  asm volatile (
1051  "1:                                          \n"
1052    MEMACCESS(0)
1053    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
1054    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1055    MEMACCESS(1)
1056    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
1057    MEMACCESS(2)
1058    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
1059    "b.gt       1b                             \n"
1060  : "+r"(src_uyvy),  // %0
1061    "+r"(dst_u),     // %1
1062    "+r"(dst_v),     // %2
1063    "+r"(width)        // %3
1064  :
1065  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1066  );
1067}
1068
1069void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1070                      uint8* dst_u, uint8* dst_v, int width) {
1071  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1072  asm volatile (
1073  "1:                                          \n"
1074    MEMACCESS(0)
1075    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1076    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1077    MEMACCESS(1)
1078    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1079    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
1080    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
1081    MEMACCESS(2)
1082    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
1083    MEMACCESS(3)
1084    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
1085    "b.gt       1b                             \n"
1086  : "+r"(src_yuy2),     // %0
1087    "+r"(src_yuy2b),    // %1
1088    "+r"(dst_u),        // %2
1089    "+r"(dst_v),        // %3
1090    "+r"(width)           // %4
1091  :
1092  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1093    "v5", "v6", "v7"  // Clobber List
1094  );
1095}
1096
1097void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1098                      uint8* dst_u, uint8* dst_v, int width) {
1099  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1100  asm volatile (
1101  "1:                                          \n"
1102    MEMACCESS(0)
1103    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1104    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1105    MEMACCESS(1)
1106    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1107    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
1108    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
1109    MEMACCESS(2)
1110    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
1111    MEMACCESS(3)
1112    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
1113    "b.gt       1b                             \n"
1114  : "+r"(src_uyvy),     // %0
1115    "+r"(src_uyvyb),    // %1
1116    "+r"(dst_u),        // %2
1117    "+r"(dst_v),        // %3
1118    "+r"(width)           // %4
1119  :
1120  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1121    "v5", "v6", "v7"  // Clobber List
1122  );
1123}
1124
1125// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1126void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1127                         const uint8* shuffler, int width) {
1128  asm volatile (
1129    MEMACCESS(3)
1130    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
1131  "1:                                          \n"
1132    MEMACCESS(0)
1133    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
1134    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
1135    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
1136    MEMACCESS(1)
1137    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
1138    "b.gt       1b                             \n"
1139  : "+r"(src_argb),  // %0
1140    "+r"(dst_argb),  // %1
1141    "+r"(width)        // %2
1142  : "r"(shuffler)    // %3
1143  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1144  );
1145}
1146
1147void I422ToYUY2Row_NEON(const uint8* src_y,
1148                        const uint8* src_u,
1149                        const uint8* src_v,
1150                        uint8* dst_yuy2, int width) {
1151  asm volatile (
1152  "1:                                          \n"
1153    MEMACCESS(0)
1154    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
1155    "orr        v2.8b, v1.8b, v1.8b            \n"
1156    MEMACCESS(1)
1157    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
1158    MEMACCESS(2)
1159    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
1160    "subs       %w4, %w4, #16                  \n"  // 16 pixels
1161    MEMACCESS(3)
1162    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1163    "b.gt       1b                             \n"
1164  : "+r"(src_y),     // %0
1165    "+r"(src_u),     // %1
1166    "+r"(src_v),     // %2
1167    "+r"(dst_yuy2),  // %3
1168    "+r"(width)      // %4
1169  :
1170  : "cc", "memory", "v0", "v1", "v2", "v3"
1171  );
1172}
1173
1174void I422ToUYVYRow_NEON(const uint8* src_y,
1175                        const uint8* src_u,
1176                        const uint8* src_v,
1177                        uint8* dst_uyvy, int width) {
1178  asm volatile (
1179  "1:                                          \n"
1180    MEMACCESS(0)
1181    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
1182    "orr        v3.8b, v2.8b, v2.8b            \n"
1183    MEMACCESS(1)
1184    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
1185    MEMACCESS(2)
1186    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
1187    "subs       %w4, %w4, #16                  \n"  // 16 pixels
1188    MEMACCESS(3)
1189    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1190    "b.gt       1b                             \n"
1191  : "+r"(src_y),     // %0
1192    "+r"(src_u),     // %1
1193    "+r"(src_v),     // %2
1194    "+r"(dst_uyvy),  // %3
1195    "+r"(width)      // %4
1196  :
1197  : "cc", "memory", "v0", "v1", "v2", "v3"
1198  );
1199}
1200
1201void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1202  asm volatile (
1203  "1:                                          \n"
1204    MEMACCESS(0)
1205    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1206    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1207    ARGBTORGB565
1208    MEMACCESS(1)
1209    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
1210    "b.gt       1b                             \n"
1211  : "+r"(src_argb),  // %0
1212    "+r"(dst_rgb565),  // %1
1213    "+r"(width)        // %2
1214  :
1215  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1216  );
1217}
1218
1219void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
1220                                const uint32 dither4, int width) {
1221  asm volatile (
1222    "dup        v1.4s, %w2                     \n"  // dither4
1223  "1:                                          \n"
1224    MEMACCESS(1)
1225    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
1226    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1227    "uqadd      v20.8b, v20.8b, v1.8b          \n"
1228    "uqadd      v21.8b, v21.8b, v1.8b          \n"
1229    "uqadd      v22.8b, v22.8b, v1.8b          \n"
1230    ARGBTORGB565
1231    MEMACCESS(0)
1232    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
1233    "b.gt       1b                             \n"
1234  : "+r"(dst_rgb)    // %0
1235  : "r"(src_argb),   // %1
1236    "r"(dither4),    // %2
1237    "r"(width)       // %3
1238  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
1239  );
1240}
1241
1242void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1243                            int width) {
1244  asm volatile (
1245  "1:                                          \n"
1246    MEMACCESS(0)
1247    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1248    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1249    ARGBTOARGB1555
1250    MEMACCESS(1)
1251    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
1252    "b.gt       1b                             \n"
1253  : "+r"(src_argb),  // %0
1254    "+r"(dst_argb1555),  // %1
1255    "+r"(width)        // %2
1256  :
1257  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1258  );
1259}
1260
1261void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1262                            int width) {
1263  asm volatile (
1264    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
1265  "1:                                          \n"
1266    MEMACCESS(0)
1267    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1268    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1269    ARGBTOARGB4444
1270    MEMACCESS(1)
1271    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
1272    "b.gt       1b                             \n"
1273  : "+r"(src_argb),      // %0
1274    "+r"(dst_argb4444),  // %1
1275    "+r"(width)            // %2
1276  :
1277  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1278  );
1279}
1280
1281void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1282  asm volatile (
1283    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1284    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1285    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1286    "movi       v7.8b, #16                     \n"  // Add 16 constant
1287  "1:                                          \n"
1288    MEMACCESS(0)
1289    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1290    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1291    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1292    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1293    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1294    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1295    "uqadd      v0.8b, v0.8b, v7.8b            \n"
1296    MEMACCESS(1)
1297    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1298    "b.gt       1b                             \n"
1299  : "+r"(src_argb),  // %0
1300    "+r"(dst_y),     // %1
1301    "+r"(width)        // %2
1302  :
1303  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1304  );
1305}
1306
1307void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
1308  asm volatile (
1309  "1:                                          \n"
1310    MEMACCESS(0)
1311    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels
1312    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
1313    MEMACCESS(1)
1314    "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
1315    "b.gt       1b                             \n"
1316  : "+r"(src_argb),   // %0
1317    "+r"(dst_a),      // %1
1318    "+r"(width)       // %2
1319  :
1320  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1321  );
1322}
1323
1324void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1325  asm volatile (
1326    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
1327    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
1328    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
1329  "1:                                          \n"
1330    MEMACCESS(0)
1331    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1332    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1333    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1334    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1335    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1336    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
1337    MEMACCESS(1)
1338    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1339    "b.gt       1b                             \n"
1340  : "+r"(src_argb),  // %0
1341    "+r"(dst_y),     // %1
1342    "+r"(width)        // %2
1343  :
1344  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1345  );
1346}
1347
1348// 8x1 pixels.
1349void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1350                         int width) {
1351  asm volatile (
1352    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
1353    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
1354    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
1355    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
1356    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
1357    "movi       v29.16b,#0x80                  \n"  // 128.5
1358  "1:                                          \n"
1359    MEMACCESS(0)
1360    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1361    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1362    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
1363    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
1364    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
1365    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
1366
1367    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
1368    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
1369    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
1370    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
1371
1372    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
1373    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1374
1375    MEMACCESS(1)
1376    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1377    MEMACCESS(2)
1378    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1379    "b.gt       1b                             \n"
1380  : "+r"(src_argb),  // %0
1381    "+r"(dst_u),     // %1
1382    "+r"(dst_v),     // %2
1383    "+r"(width)        // %3
1384  :
1385  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1386    "v24", "v25", "v26", "v27", "v28", "v29"
1387  );
1388}
1389
1390#define RGBTOUV_SETUP_REG                                                      \
1391    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
1392    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
1393    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
1394    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
1395    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
1396    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
1397
1398// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
1399void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1400                         int width) {
1401  asm volatile (
1402    RGBTOUV_SETUP_REG
1403  "1:                                          \n"
1404    MEMACCESS(0)
1405    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1406    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1407    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1408    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1409    MEMACCESS(0)
1410    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
1411    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1412    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1413    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1414
1415    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
1416    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
1417    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
1418
1419    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1420    "urshr      v1.8h, v1.8h, #1               \n"
1421    "urshr      v2.8h, v2.8h, #1               \n"
1422
1423    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
1424    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
1425    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
1426    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
1427    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1428    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
1429    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
1430    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
1431    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
1432    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
1433    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
1434    MEMACCESS(1)
1435    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1436    MEMACCESS(2)
1437    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1438    "b.gt       1b                             \n"
1439  : "+r"(src_argb),  // %0
1440    "+r"(dst_u),     // %1
1441    "+r"(dst_v),     // %2
1442    "+r"(width)        // %3
1443  :
1444  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1445    "v20", "v21", "v22", "v23", "v24", "v25"
1446  );
1447}
1448
1449// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
1450#define RGBTOUV(QB, QG, QR) \
1451    "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
1452    "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
1453    "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
1454    "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
1455    "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
1456    "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
1457    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
1458    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
1459    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
1460    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
1461
1462// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1463// TODO(fbarchard): consider ptrdiff_t for all strides.
1464
1465void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1466                      uint8* dst_u, uint8* dst_v, int width) {
1467  const uint8* src_argb_1 = src_argb + src_stride_argb;
1468  asm volatile (
1469    RGBTOUV_SETUP_REG
1470  "1:                                          \n"
1471    MEMACCESS(0)
1472    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1473    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1474    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1475    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1476
1477    MEMACCESS(1)
1478    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
1479    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1480    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1481    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1482
1483    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1484    "urshr      v1.8h, v1.8h, #1               \n"
1485    "urshr      v2.8h, v2.8h, #1               \n"
1486
1487    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1488    RGBTOUV(v0.8h, v1.8h, v2.8h)
1489    MEMACCESS(2)
1490    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1491    MEMACCESS(3)
1492    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1493    "b.gt       1b                             \n"
1494  : "+r"(src_argb),  // %0
1495    "+r"(src_argb_1),  // %1
1496    "+r"(dst_u),     // %2
1497    "+r"(dst_v),     // %3
1498    "+r"(width)        // %4
1499  :
1500  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1501    "v20", "v21", "v22", "v23", "v24", "v25"
1502  );
1503}
1504
1505// TODO(fbarchard): Subsample match C code.
1506void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1507                       uint8* dst_u, uint8* dst_v, int width) {
1508  const uint8* src_argb_1 = src_argb + src_stride_argb;
1509  asm volatile (
1510    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
1511    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
1512    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
1513    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
1514    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
1515    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1516  "1:                                          \n"
1517    MEMACCESS(0)
1518    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1519    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1520    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1521    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1522    MEMACCESS(1)
1523    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
1524    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1525    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1526    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1527
1528    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1529    "urshr      v1.8h, v1.8h, #1               \n"
1530    "urshr      v2.8h, v2.8h, #1               \n"
1531
1532    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1533    RGBTOUV(v0.8h, v1.8h, v2.8h)
1534    MEMACCESS(2)
1535    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1536    MEMACCESS(3)
1537    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1538    "b.gt       1b                             \n"
1539  : "+r"(src_argb),  // %0
1540    "+r"(src_argb_1),  // %1
1541    "+r"(dst_u),     // %2
1542    "+r"(dst_v),     // %3
1543    "+r"(width)        // %4
1544  :
1545  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1546    "v20", "v21", "v22", "v23", "v24", "v25"
1547  );
1548}
1549
1550void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1551                      uint8* dst_u, uint8* dst_v, int width) {
1552  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1553  asm volatile (
1554    RGBTOUV_SETUP_REG
1555  "1:                                          \n"
1556    MEMACCESS(0)
1557    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1558    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
1559    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1560    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
1561    MEMACCESS(1)
1562    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
1563    "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
1564    "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
1565    "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
1566
1567    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1568    "urshr      v1.8h, v3.8h, #1               \n"
1569    "urshr      v2.8h, v2.8h, #1               \n"
1570
1571    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1572    RGBTOUV(v0.8h, v1.8h, v2.8h)
1573    MEMACCESS(2)
1574    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1575    MEMACCESS(3)
1576    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1577    "b.gt       1b                             \n"
1578  : "+r"(src_bgra),  // %0
1579    "+r"(src_bgra_1),  // %1
1580    "+r"(dst_u),     // %2
1581    "+r"(dst_v),     // %3
1582    "+r"(width)        // %4
1583  :
1584  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1585    "v20", "v21", "v22", "v23", "v24", "v25"
1586  );
1587}
1588
1589void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1590                      uint8* dst_u, uint8* dst_v, int width) {
1591  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1592  asm volatile (
1593    RGBTOUV_SETUP_REG
1594  "1:                                          \n"
1595    MEMACCESS(0)
1596    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1597    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1598    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1599    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1600    MEMACCESS(1)
1601    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1602    "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
1603    "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1604    "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
1605
1606    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average
1607    "urshr      v2.8h, v2.8h, #1               \n"
1608    "urshr      v1.8h, v1.8h, #1               \n"
1609
1610    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1611    RGBTOUV(v0.8h, v2.8h, v1.8h)
1612    MEMACCESS(2)
1613    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1614    MEMACCESS(3)
1615    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1616    "b.gt       1b                             \n"
1617  : "+r"(src_abgr),  // %0
1618    "+r"(src_abgr_1),  // %1
1619    "+r"(dst_u),     // %2
1620    "+r"(dst_v),     // %3
1621    "+r"(width)        // %4
1622  :
1623  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1624    "v20", "v21", "v22", "v23", "v24", "v25"
1625  );
1626}
1627
1628void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
1629                      uint8* dst_u, uint8* dst_v, int width) {
1630  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
1631  asm volatile (
1632    RGBTOUV_SETUP_REG
1633  "1:                                          \n"
1634    MEMACCESS(0)
1635    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1636    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
1637    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1638    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
1639    MEMACCESS(1)
1640    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1641    "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
1642    "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
1643    "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
1644
1645    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1646    "urshr      v1.8h, v1.8h, #1               \n"
1647    "urshr      v2.8h, v2.8h, #1               \n"
1648
1649    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1650    RGBTOUV(v0.8h, v1.8h, v2.8h)
1651    MEMACCESS(2)
1652    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1653    MEMACCESS(3)
1654    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1655    "b.gt       1b                             \n"
1656  : "+r"(src_rgba),  // %0
1657    "+r"(src_rgba_1),  // %1
1658    "+r"(dst_u),     // %2
1659    "+r"(dst_v),     // %3
1660    "+r"(width)        // %4
1661  :
1662  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1663    "v20", "v21", "v22", "v23", "v24", "v25"
1664  );
1665}
1666
1667void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
1668                       uint8* dst_u, uint8* dst_v, int width) {
1669  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1670  asm volatile (
1671    RGBTOUV_SETUP_REG
1672  "1:                                          \n"
1673    MEMACCESS(0)
1674    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
1675    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1676    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1677    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1678    MEMACCESS(1)
1679    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
1680    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1681    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1682    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1683
1684    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1685    "urshr      v1.8h, v1.8h, #1               \n"
1686    "urshr      v2.8h, v2.8h, #1               \n"
1687
1688    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1689    RGBTOUV(v0.8h, v1.8h, v2.8h)
1690    MEMACCESS(2)
1691    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1692    MEMACCESS(3)
1693    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1694    "b.gt       1b                             \n"
1695  : "+r"(src_rgb24),  // %0
1696    "+r"(src_rgb24_1),  // %1
1697    "+r"(dst_u),     // %2
1698    "+r"(dst_v),     // %3
1699    "+r"(width)        // %4
1700  :
1701  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1702    "v20", "v21", "v22", "v23", "v24", "v25"
1703  );
1704}
1705
1706void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
1707                     uint8* dst_u, uint8* dst_v, int width) {
1708  const uint8* src_raw_1 = src_raw + src_stride_raw;
1709  asm volatile (
1710    RGBTOUV_SETUP_REG
1711  "1:                                          \n"
1712    MEMACCESS(0)
1713    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
1714    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1715    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1716    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1717    MEMACCESS(1)
1718    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
1719    "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
1720    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1721    "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
1722
1723    "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
1724    "urshr      v1.8h, v1.8h, #1               \n"
1725    "urshr      v0.8h, v0.8h, #1               \n"
1726
1727    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1728    RGBTOUV(v2.8h, v1.8h, v0.8h)
1729    MEMACCESS(2)
1730    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1731    MEMACCESS(3)
1732    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1733    "b.gt       1b                             \n"
1734  : "+r"(src_raw),  // %0
1735    "+r"(src_raw_1),  // %1
1736    "+r"(dst_u),     // %2
1737    "+r"(dst_v),     // %3
1738    "+r"(width)        // %4
1739  :
1740  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1741    "v20", "v21", "v22", "v23", "v24", "v25"
1742  );
1743}
1744
1745// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
1746void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
1747                        uint8* dst_u, uint8* dst_v, int width) {
1748  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1749  asm volatile (
1750    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
1751    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
1752    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
1753    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
1754    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
1755    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1756  "1:                                          \n"
1757    MEMACCESS(0)
1758    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
1759    RGB565TOARGB
1760    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1761    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1762    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1763    MEMACCESS(0)
1764    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
1765    RGB565TOARGB
1766    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1767    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1768    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1769
1770    MEMACCESS(1)
1771    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
1772    RGB565TOARGB
1773    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1774    "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1775    "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1776    MEMACCESS(1)
1777    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
1778    RGB565TOARGB
1779    "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1780    "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1781    "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1782
1783    "ins        v16.D[1], v17.D[0]             \n"
1784    "ins        v18.D[1], v19.D[0]             \n"
1785    "ins        v20.D[1], v21.D[0]             \n"
1786
1787    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1788    "urshr      v5.8h, v18.8h, #1              \n"
1789    "urshr      v6.8h, v20.8h, #1              \n"
1790
1791    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1792    "mul        v16.8h, v4.8h, v22.8h          \n"  // B
1793    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
1794    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
1795    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
1796    "mul        v17.8h, v6.8h, v22.8h          \n"  // R
1797    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
1798    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
1799    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
1800    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
1801    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
1802    MEMACCESS(2)
1803    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1804    MEMACCESS(3)
1805    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1806    "b.gt       1b                             \n"
1807  : "+r"(src_rgb565),  // %0
1808    "+r"(src_rgb565_1),  // %1
1809    "+r"(dst_u),     // %2
1810    "+r"(dst_v),     // %3
1811    "+r"(width)        // %4
1812  :
1813  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1814    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
1815    "v25", "v26", "v27"
1816  );
1817}
1818
1819// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
1820void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
1821                        uint8* dst_u, uint8* dst_v, int width) {
1822  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
1823  asm volatile (
1824    RGBTOUV_SETUP_REG
1825  "1:                                          \n"
1826    MEMACCESS(0)
1827    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1828    RGB555TOARGB
1829    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1830    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1831    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1832    MEMACCESS(0)
1833    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
1834    RGB555TOARGB
1835    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1836    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1837    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1838
1839    MEMACCESS(1)
1840    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
1841    RGB555TOARGB
1842    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1843    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1844    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1845    MEMACCESS(1)
1846    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
1847    RGB555TOARGB
1848    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1849    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1850    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1851
1852    "ins        v16.D[1], v26.D[0]             \n"
1853    "ins        v17.D[1], v27.D[0]             \n"
1854    "ins        v18.D[1], v28.D[0]             \n"
1855
1856    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1857    "urshr      v5.8h, v17.8h, #1              \n"
1858    "urshr      v6.8h, v18.8h, #1              \n"
1859
1860    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1861    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
1862    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
1863    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
1864    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
1865    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
1866    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
1867    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
1868    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1869    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
1870    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1871    MEMACCESS(2)
1872    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1873    MEMACCESS(3)
1874    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1875    "b.gt       1b                             \n"
1876  : "+r"(src_argb1555),  // %0
1877    "+r"(src_argb1555_1),  // %1
1878    "+r"(dst_u),     // %2
1879    "+r"(dst_v),     // %3
1880    "+r"(width)        // %4
1881  :
1882  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1883    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1884    "v26", "v27", "v28"
1885  );
1886}
1887
1888// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
1889void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
1890                          uint8* dst_u, uint8* dst_v, int width) {
1891  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
1892  asm volatile (
1893    RGBTOUV_SETUP_REG
1894  "1:                                          \n"
1895    MEMACCESS(0)
1896    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
1897    ARGB4444TOARGB
1898    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1899    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1900    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1901    MEMACCESS(0)
1902    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
1903    ARGB4444TOARGB
1904    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1905    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1906    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1907
1908    MEMACCESS(1)
1909    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
1910    ARGB4444TOARGB
1911    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1912    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1913    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1914    MEMACCESS(1)
1915    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
1916    ARGB4444TOARGB
1917    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1918    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1919    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1920
1921    "ins        v16.D[1], v26.D[0]             \n"
1922    "ins        v17.D[1], v27.D[0]             \n"
1923    "ins        v18.D[1], v28.D[0]             \n"
1924
1925    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1926    "urshr      v5.8h, v17.8h, #1              \n"
1927    "urshr      v6.8h, v18.8h, #1              \n"
1928
1929    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1930    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
1931    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
1932    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
1933    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
1934    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
1935    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
1936    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
1937    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1938    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
1939    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1940    MEMACCESS(2)
1941    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1942    MEMACCESS(3)
1943    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1944    "b.gt       1b                             \n"
1945  : "+r"(src_argb4444),  // %0
1946    "+r"(src_argb4444_1),  // %1
1947    "+r"(dst_u),     // %2
1948    "+r"(dst_v),     // %3
1949    "+r"(width)        // %4
1950  :
1951  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1952    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1953    "v26", "v27", "v28"
1954
1955  );
1956}
1957
1958void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
1959  asm volatile (
1960    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
1961    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
1962    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
1963    "movi       v27.8b, #16                    \n"  // Add 16 constant
1964  "1:                                          \n"
1965    MEMACCESS(0)
1966    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
1967    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1968    RGB565TOARGB
1969    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
1970    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
1971    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
1972    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1973    "uqadd      v0.8b, v0.8b, v27.8b           \n"
1974    MEMACCESS(1)
1975    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1976    "b.gt       1b                             \n"
1977  : "+r"(src_rgb565),  // %0
1978    "+r"(dst_y),       // %1
1979    "+r"(width)          // %2
1980  :
1981  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
1982    "v24", "v25", "v26", "v27"
1983  );
1984}
1985
1986void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
1987  asm volatile (
1988    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1989    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1990    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1991    "movi       v7.8b, #16                     \n"  // Add 16 constant
1992  "1:                                          \n"
1993    MEMACCESS(0)
1994    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1995    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1996    ARGB1555TOARGB
1997    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1998    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1999    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
2000    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
2001    "uqadd      v0.8b, v0.8b, v7.8b            \n"
2002    MEMACCESS(1)
2003    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2004    "b.gt       1b                             \n"
2005  : "+r"(src_argb1555),  // %0
2006    "+r"(dst_y),         // %1
2007    "+r"(width)            // %2
2008  :
2009  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2010  );
2011}
2012
2013void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
2014  asm volatile (
2015    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
2016    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
2017    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
2018    "movi       v27.8b, #16                    \n"  // Add 16 constant
2019  "1:                                          \n"
2020    MEMACCESS(0)
2021    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
2022    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2023    ARGB4444TOARGB
2024    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
2025    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
2026    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
2027    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
2028    "uqadd      v0.8b, v0.8b, v27.8b           \n"
2029    MEMACCESS(1)
2030    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2031    "b.gt       1b                             \n"
2032  : "+r"(src_argb4444),  // %0
2033    "+r"(dst_y),         // %1
2034    "+r"(width)            // %2
2035  :
2036  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2037  );
2038}
2039
2040void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
2041  asm volatile (
2042    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2043    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2044    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2045    "movi       v7.8b, #16                     \n"  // Add 16 constant
2046  "1:                                          \n"
2047    MEMACCESS(0)
2048    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2049    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2050    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
2051    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2052    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
2053    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2054    "uqadd      v0.8b, v0.8b, v7.8b            \n"
2055    MEMACCESS(1)
2056    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2057    "b.gt       1b                             \n"
2058  : "+r"(src_bgra),  // %0
2059    "+r"(dst_y),     // %1
2060    "+r"(width)        // %2
2061  :
2062  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2063  );
2064}
2065
2066void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
2067  asm volatile (
2068    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2069    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2070    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2071    "movi       v7.8b, #16                     \n"  // Add 16 constant
2072  "1:                                          \n"
2073    MEMACCESS(0)
2074    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2075    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2076    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
2077    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2078    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
2079    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2080    "uqadd      v0.8b, v0.8b, v7.8b            \n"
2081    MEMACCESS(1)
2082    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2083    "b.gt       1b                             \n"
2084  : "+r"(src_abgr),  // %0
2085    "+r"(dst_y),     // %1
2086    "+r"(width)        // %2
2087  :
2088  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2089  );
2090}
2091
2092void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
2093  asm volatile (
2094    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2095    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2096    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2097    "movi       v7.8b, #16                     \n"  // Add 16 constant
2098  "1:                                          \n"
2099    MEMACCESS(0)
2100    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2101    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2102    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
2103    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2104    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
2105    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2106    "uqadd      v0.8b, v0.8b, v7.8b            \n"
2107    MEMACCESS(1)
2108    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2109    "b.gt       1b                             \n"
2110  : "+r"(src_rgba),  // %0
2111    "+r"(dst_y),     // %1
2112    "+r"(width)        // %2
2113  :
2114  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2115  );
2116}
2117
2118void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
2119  asm volatile (
2120    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2121    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2122    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2123    "movi       v7.8b, #16                     \n"  // Add 16 constant
2124  "1:                                          \n"
2125    MEMACCESS(0)
2126    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
2127    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2128    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
2129    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2130    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
2131    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2132    "uqadd      v0.8b, v0.8b, v7.8b            \n"
2133    MEMACCESS(1)
2134    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2135    "b.gt       1b                             \n"
2136  : "+r"(src_rgb24),  // %0
2137    "+r"(dst_y),      // %1
2138    "+r"(width)         // %2
2139  :
2140  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2141  );
2142}
2143
2144void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
2145  asm volatile (
2146    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2147    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2148    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2149    "movi       v7.8b, #16                     \n"  // Add 16 constant
2150  "1:                                          \n"
2151    MEMACCESS(0)
2152    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
2153    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2154    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
2155    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2156    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
2157    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2158    "uqadd      v0.8b, v0.8b, v7.8b            \n"
2159    MEMACCESS(1)
2160    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2161    "b.gt       1b                             \n"
2162  : "+r"(src_raw),  // %0
2163    "+r"(dst_y),    // %1
2164    "+r"(width)       // %2
2165  :
2166  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2167  );
2168}
2169
2170// Bilinear filter 16x2 -> 16x1
2171void InterpolateRow_NEON(uint8* dst_ptr,
2172                         const uint8* src_ptr, ptrdiff_t src_stride,
2173                         int dst_width, int source_y_fraction) {
2174  int y1_fraction = source_y_fraction;
2175  int y0_fraction = 256 - y1_fraction;
2176  const uint8* src_ptr1 = src_ptr + src_stride;
2177  asm volatile (
2178    "cmp        %w4, #0                        \n"
2179    "b.eq       100f                           \n"
2180    "cmp        %w4, #128                      \n"
2181    "b.eq       50f                            \n"
2182
2183    "dup        v5.16b, %w4                    \n"
2184    "dup        v4.16b, %w5                    \n"
2185    // General purpose row blend.
2186  "1:                                          \n"
2187    MEMACCESS(1)
2188    "ld1        {v0.16b}, [%1], #16            \n"
2189    MEMACCESS(2)
2190    "ld1        {v1.16b}, [%2], #16            \n"
2191    "subs       %w3, %w3, #16                  \n"
2192    "umull      v2.8h, v0.8b,  v4.8b           \n"
2193    "umull2     v3.8h, v0.16b, v4.16b          \n"
2194    "umlal      v2.8h, v1.8b,  v5.8b           \n"
2195    "umlal2     v3.8h, v1.16b, v5.16b          \n"
2196    "rshrn      v0.8b,  v2.8h, #8              \n"
2197    "rshrn2     v0.16b, v3.8h, #8              \n"
2198    MEMACCESS(0)
2199    "st1        {v0.16b}, [%0], #16            \n"
2200    "b.gt       1b                             \n"
2201    "b          99f                            \n"
2202
2203    // Blend 50 / 50.
2204  "50:                                         \n"
2205    MEMACCESS(1)
2206    "ld1        {v0.16b}, [%1], #16            \n"
2207    MEMACCESS(2)
2208    "ld1        {v1.16b}, [%2], #16            \n"
2209    "subs       %w3, %w3, #16                  \n"
2210    "urhadd     v0.16b, v0.16b, v1.16b         \n"
2211    MEMACCESS(0)
2212    "st1        {v0.16b}, [%0], #16            \n"
2213    "b.gt       50b                            \n"
2214    "b          99f                            \n"
2215
2216    // Blend 100 / 0 - Copy row unchanged.
2217  "100:                                        \n"
2218    MEMACCESS(1)
2219    "ld1        {v0.16b}, [%1], #16            \n"
2220    "subs       %w3, %w3, #16                  \n"
2221    MEMACCESS(0)
2222    "st1        {v0.16b}, [%0], #16            \n"
2223    "b.gt       100b                           \n"
2224
2225  "99:                                         \n"
2226  : "+r"(dst_ptr),          // %0
2227    "+r"(src_ptr),          // %1
2228    "+r"(src_ptr1),         // %2
2229    "+r"(dst_width),        // %3
2230    "+r"(y1_fraction),      // %4
2231    "+r"(y0_fraction)       // %5
2232  :
2233  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2234  );
2235}
2236
2237// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2238void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2239                       uint8* dst_argb, int width) {
2240  asm volatile (
2241    "subs       %w3, %w3, #8                   \n"
2242    "b.lt       89f                            \n"
2243    // Blend 8 pixels.
2244  "8:                                          \n"
2245    MEMACCESS(0)
2246    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
2247    MEMACCESS(1)
2248    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
2249    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2250    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
2251    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
2252    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
2253    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
2254    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
2255    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
2256    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
2257    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
2258    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
2259    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
2260    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
2261    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
2262    "movi       v3.8b, #255                    \n"  // a = 255
2263    MEMACCESS(2)
2264    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2265    "b.ge       8b                             \n"
2266
2267  "89:                                         \n"
2268    "adds       %w3, %w3, #8-1                 \n"
2269    "b.lt       99f                            \n"
2270
2271    // Blend 1 pixels.
2272  "1:                                          \n"
2273    MEMACCESS(0)
2274    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
2275    MEMACCESS(1)
2276    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
2277    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
2278    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
2279    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
2280    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
2281    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
2282    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
2283    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
2284    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
2285    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
2286    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
2287    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
2288    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
2289    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
2290    "movi       v3.8b, #255                    \n"  // a = 255
2291    MEMACCESS(2)
2292    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
2293    "b.ge       1b                             \n"
2294
2295  "99:                                         \n"
2296
2297  : "+r"(src_argb0),    // %0
2298    "+r"(src_argb1),    // %1
2299    "+r"(dst_argb),     // %2
2300    "+r"(width)         // %3
2301  :
2302  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2303    "v16", "v17", "v18"
2304  );
2305}
2306
2307// Attenuate 8 pixels at a time.
2308void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2309  asm volatile (
2310    // Attenuate 8 pixels.
2311  "1:                                          \n"
2312    MEMACCESS(0)
2313    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
2314    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2315    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
2316    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
2317    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
2318    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
2319    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
2320    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
2321    MEMACCESS(1)
2322    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
2323    "b.gt       1b                             \n"
2324  : "+r"(src_argb),   // %0
2325    "+r"(dst_argb),   // %1
2326    "+r"(width)       // %2
2327  :
2328  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2329  );
2330}
2331
2332// Quantize 8 ARGB pixels (32 bytes).
2333// dst = (dst * scale >> 16) * interval_size + interval_offset;
2334void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2335                          int interval_offset, int width) {
2336  asm volatile (
2337    "dup        v4.8h, %w2                     \n"
2338    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
2339    "dup        v5.8h, %w3                     \n"  // interval multiply.
2340    "dup        v6.8h, %w4                     \n"  // interval add
2341
2342    // 8 pixel loop.
2343  "1:                                          \n"
2344    MEMACCESS(0)
2345    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
2346    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
2347    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
2348    "uxtl       v1.8h, v1.8b                   \n"
2349    "uxtl       v2.8h, v2.8b                   \n"
2350    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
2351    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
2352    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
2353    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
2354    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
2355    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
2356    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
2357    "add        v1.8h, v1.8h, v6.8h            \n"  // g
2358    "add        v2.8h, v2.8h, v6.8h            \n"  // r
2359    "uqxtn      v0.8b, v0.8h                   \n"
2360    "uqxtn      v1.8b, v1.8h                   \n"
2361    "uqxtn      v2.8b, v2.8h                   \n"
2362    MEMACCESS(0)
2363    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
2364    "b.gt       1b                             \n"
2365  : "+r"(dst_argb),       // %0
2366    "+r"(width)           // %1
2367  : "r"(scale),           // %2
2368    "r"(interval_size),   // %3
2369    "r"(interval_offset)  // %4
2370  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2371  );
2372}
2373
2374// Shade 8 pixels at a time by specified value.
2375// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
2376// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2377void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2378                       uint32 value) {
2379  asm volatile (
2380    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
2381    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
2382    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
2383
2384    // 8 pixel loop.
2385  "1:                                          \n"
2386    MEMACCESS(0)
2387    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2388    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2389    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
2390    "uxtl       v5.8h, v5.8b                   \n"
2391    "uxtl       v6.8h, v6.8b                   \n"
2392    "uxtl       v7.8h, v7.8b                   \n"
2393    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
2394    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
2395    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
2396    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
2397    "uqxtn      v4.8b, v4.8h                   \n"
2398    "uqxtn      v5.8b, v5.8h                   \n"
2399    "uqxtn      v6.8b, v6.8h                   \n"
2400    "uqxtn      v7.8b, v7.8h                   \n"
2401    MEMACCESS(1)
2402    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
2403    "b.gt       1b                             \n"
2404  : "+r"(src_argb),       // %0
2405    "+r"(dst_argb),       // %1
2406    "+r"(width)           // %2
2407  : "r"(value)            // %3
2408  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
2409  );
2410}
2411
2412// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
2413// Similar to ARGBToYJ but stores ARGB.
2414// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2415void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2416  asm volatile (
2417    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
2418    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
2419    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
2420  "1:                                          \n"
2421    MEMACCESS(0)
2422    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2423    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2424    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
2425    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
2426    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
2427    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
2428    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
2429    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
2430    MEMACCESS(1)
2431    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
2432    "b.gt       1b                             \n"
2433  : "+r"(src_argb),  // %0
2434    "+r"(dst_argb),  // %1
2435    "+r"(width)      // %2
2436  :
2437  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
2438  );
2439}
2440
2441// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2442//    b = (r * 35 + g * 68 + b * 17) >> 7
2443//    g = (r * 45 + g * 88 + b * 22) >> 7
2444//    r = (r * 50 + g * 98 + b * 24) >> 7
2445
2446void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2447  asm volatile (
2448    "movi       v20.8b, #17                    \n"  // BB coefficient
2449    "movi       v21.8b, #68                    \n"  // BG coefficient
2450    "movi       v22.8b, #35                    \n"  // BR coefficient
2451    "movi       v24.8b, #22                    \n"  // GB coefficient
2452    "movi       v25.8b, #88                    \n"  // GG coefficient
2453    "movi       v26.8b, #45                    \n"  // GR coefficient
2454    "movi       v28.8b, #24                    \n"  // BB coefficient
2455    "movi       v29.8b, #98                    \n"  // BG coefficient
2456    "movi       v30.8b, #50                    \n"  // BR coefficient
2457  "1:                                          \n"
2458    MEMACCESS(0)
2459    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
2460    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
2461    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
2462    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
2463    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
2464    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
2465    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
2466    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
2467    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
2468    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
2469    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
2470    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
2471    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
2472    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
2473    MEMACCESS(0)
2474    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
2475    "b.gt       1b                             \n"
2476  : "+r"(dst_argb),  // %0
2477    "+r"(width)      // %1
2478  :
2479  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2480    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
2481  );
2482}
2483
2484// Tranform 8 ARGB pixels (32 bytes) with color matrix.
2485// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
2486// needs to saturate.  Consider doing a non-saturating version.
2487void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2488                             const int8* matrix_argb, int width) {
2489  asm volatile (
2490    MEMACCESS(3)
2491    "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
2492    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
2493    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
2494
2495  "1:                                          \n"
2496    MEMACCESS(0)
2497    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
2498    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2499    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
2500    "uxtl       v17.8h, v17.8b                 \n"  // g
2501    "uxtl       v18.8h, v18.8b                 \n"  // r
2502    "uxtl       v19.8h, v19.8b                 \n"  // a
2503    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
2504    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
2505    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
2506    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
2507    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
2508    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
2509    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
2510    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
2511    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2512    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2513    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2514    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2515    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
2516    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
2517    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
2518    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
2519    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2520    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2521    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2522    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2523    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
2524    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
2525    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
2526    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
2527    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2528    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2529    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2530    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2531    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
2532    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
2533    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
2534    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
2535    MEMACCESS(1)
2536    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
2537    "b.gt       1b                             \n"
2538  : "+r"(src_argb),   // %0
2539    "+r"(dst_argb),   // %1
2540    "+r"(width)       // %2
2541  : "r"(matrix_argb)  // %3
2542  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
2543    "v18", "v19", "v22", "v23", "v24", "v25"
2544  );
2545}
2546
2547// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2548// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
2549void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2550                          uint8* dst_argb, int width) {
2551  asm volatile (
2552    // 8 pixel loop.
2553  "1:                                          \n"
2554    MEMACCESS(0)
2555    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2556    MEMACCESS(1)
2557    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2558    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2559    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
2560    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
2561    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
2562    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
2563    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
2564    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
2565    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
2566    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
2567    MEMACCESS(2)
2568    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2569    "b.gt       1b                             \n"
2570
2571  : "+r"(src_argb0),  // %0
2572    "+r"(src_argb1),  // %1
2573    "+r"(dst_argb),   // %2
2574    "+r"(width)       // %3
2575  :
2576  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2577  );
2578}
2579
2580// Add 2 rows of ARGB pixels together, 8 pixels at a time.
2581void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2582                     uint8* dst_argb, int width) {
2583  asm volatile (
2584    // 8 pixel loop.
2585  "1:                                          \n"
2586    MEMACCESS(0)
2587    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2588    MEMACCESS(1)
2589    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2590    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2591    "uqadd      v0.8b, v0.8b, v4.8b            \n"
2592    "uqadd      v1.8b, v1.8b, v5.8b            \n"
2593    "uqadd      v2.8b, v2.8b, v6.8b            \n"
2594    "uqadd      v3.8b, v3.8b, v7.8b            \n"
2595    MEMACCESS(2)
2596    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2597    "b.gt       1b                             \n"
2598
2599  : "+r"(src_argb0),  // %0
2600    "+r"(src_argb1),  // %1
2601    "+r"(dst_argb),   // %2
2602    "+r"(width)       // %3
2603  :
2604  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2605  );
2606}
2607
2608// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
2609void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2610                          uint8* dst_argb, int width) {
2611  asm volatile (
2612    // 8 pixel loop.
2613  "1:                                          \n"
2614    MEMACCESS(0)
2615    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2616    MEMACCESS(1)
2617    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2618    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2619    "uqsub      v0.8b, v0.8b, v4.8b            \n"
2620    "uqsub      v1.8b, v1.8b, v5.8b            \n"
2621    "uqsub      v2.8b, v2.8b, v6.8b            \n"
2622    "uqsub      v3.8b, v3.8b, v7.8b            \n"
2623    MEMACCESS(2)
2624    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2625    "b.gt       1b                             \n"
2626
2627  : "+r"(src_argb0),  // %0
2628    "+r"(src_argb1),  // %1
2629    "+r"(dst_argb),   // %2
2630    "+r"(width)       // %3
2631  :
2632  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2633  );
2634}
2635
2636// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2637// A = 255
2638// R = Sobel
2639// G = Sobel
2640// B = Sobel
2641void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2642                     uint8* dst_argb, int width) {
2643  asm volatile (
2644    "movi       v3.8b, #255                    \n"  // alpha
2645    // 8 pixel loop.
2646  "1:                                          \n"
2647    MEMACCESS(0)
2648    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
2649    MEMACCESS(1)
2650    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
2651    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2652    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
2653    "orr        v1.8b, v0.8b, v0.8b            \n"
2654    "orr        v2.8b, v0.8b, v0.8b            \n"
2655    MEMACCESS(2)
2656    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2657    "b.gt       1b                             \n"
2658  : "+r"(src_sobelx),  // %0
2659    "+r"(src_sobely),  // %1
2660    "+r"(dst_argb),    // %2
2661    "+r"(width)        // %3
2662  :
2663  : "cc", "memory", "v0", "v1", "v2", "v3"
2664  );
2665}
2666
2667// Adds Sobel X and Sobel Y and stores Sobel into plane.
2668void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2669                          uint8* dst_y, int width) {
2670  asm volatile (
2671    // 16 pixel loop.
2672  "1:                                          \n"
2673    MEMACCESS(0)
2674    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
2675    MEMACCESS(1)
2676    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
2677    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
2678    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
2679    MEMACCESS(2)
2680    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
2681    "b.gt       1b                             \n"
2682  : "+r"(src_sobelx),  // %0
2683    "+r"(src_sobely),  // %1
2684    "+r"(dst_y),       // %2
2685    "+r"(width)        // %3
2686  :
2687  : "cc", "memory", "v0", "v1"
2688  );
2689}
2690
2691// Mixes Sobel X, Sobel Y and Sobel into ARGB.
2692// A = 255
2693// R = Sobel X
2694// G = Sobel
2695// B = Sobel Y
2696void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2697                     uint8* dst_argb, int width) {
2698  asm volatile (
2699    "movi       v3.8b, #255                    \n"  // alpha
2700    // 8 pixel loop.
2701  "1:                                          \n"
2702    MEMACCESS(0)
2703    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
2704    MEMACCESS(1)
2705    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
2706    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2707    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
2708    MEMACCESS(2)
2709    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2710    "b.gt       1b                             \n"
2711  : "+r"(src_sobelx),  // %0
2712    "+r"(src_sobely),  // %1
2713    "+r"(dst_argb),    // %2
2714    "+r"(width)        // %3
2715  :
2716  : "cc", "memory", "v0", "v1", "v2", "v3"
2717  );
2718}
2719
2720// SobelX as a matrix is
2721// -1  0  1
2722// -2  0  2
2723// -1  0  1
2724void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
2725                    const uint8* src_y2, uint8* dst_sobelx, int width) {
2726  asm volatile (
2727  "1:                                          \n"
2728    MEMACCESS(0)
2729    "ld1        {v0.8b}, [%0],%5               \n"  // top
2730    MEMACCESS(0)
2731    "ld1        {v1.8b}, [%0],%6               \n"
2732    "usubl      v0.8h, v0.8b, v1.8b            \n"
2733    MEMACCESS(1)
2734    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
2735    MEMACCESS(1)
2736    "ld1        {v3.8b}, [%1],%6               \n"
2737    "usubl      v1.8h, v2.8b, v3.8b            \n"
2738    "add        v0.8h, v0.8h, v1.8h            \n"
2739    "add        v0.8h, v0.8h, v1.8h            \n"
2740    MEMACCESS(2)
2741    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
2742    MEMACCESS(2)
2743    "ld1        {v3.8b}, [%2],%6               \n"
2744    "subs       %w4, %w4, #8                   \n"  // 8 pixels
2745    "usubl      v1.8h, v2.8b, v3.8b            \n"
2746    "add        v0.8h, v0.8h, v1.8h            \n"
2747    "abs        v0.8h, v0.8h                   \n"
2748    "uqxtn      v0.8b, v0.8h                   \n"
2749    MEMACCESS(3)
2750    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
2751    "b.gt       1b                             \n"
2752  : "+r"(src_y0),      // %0
2753    "+r"(src_y1),      // %1
2754    "+r"(src_y2),      // %2
2755    "+r"(dst_sobelx),  // %3
2756    "+r"(width)        // %4
2757  : "r"(2LL),          // %5
2758    "r"(6LL)           // %6
2759  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2760  );
2761}
2762
2763// SobelY as a matrix is
2764// -1 -2 -1
2765//  0  0  0
2766//  1  2  1
2767void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
2768                    uint8* dst_sobely, int width) {
2769  asm volatile (
2770  "1:                                          \n"
2771    MEMACCESS(0)
2772    "ld1        {v0.8b}, [%0],%4               \n"  // left
2773    MEMACCESS(1)
2774    "ld1        {v1.8b}, [%1],%4               \n"
2775    "usubl      v0.8h, v0.8b, v1.8b            \n"
2776    MEMACCESS(0)
2777    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
2778    MEMACCESS(1)
2779    "ld1        {v3.8b}, [%1],%4               \n"
2780    "usubl      v1.8h, v2.8b, v3.8b            \n"
2781    "add        v0.8h, v0.8h, v1.8h            \n"
2782    "add        v0.8h, v0.8h, v1.8h            \n"
2783    MEMACCESS(0)
2784    "ld1        {v2.8b}, [%0],%5               \n"  // right
2785    MEMACCESS(1)
2786    "ld1        {v3.8b}, [%1],%5               \n"
2787    "subs       %w3, %w3, #8                   \n"  // 8 pixels
2788    "usubl      v1.8h, v2.8b, v3.8b            \n"
2789    "add        v0.8h, v0.8h, v1.8h            \n"
2790    "abs        v0.8h, v0.8h                   \n"
2791    "uqxtn      v0.8b, v0.8h                   \n"
2792    MEMACCESS(2)
2793    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
2794    "b.gt       1b                             \n"
2795  : "+r"(src_y0),      // %0
2796    "+r"(src_y1),      // %1
2797    "+r"(dst_sobely),  // %2
2798    "+r"(width)        // %3
2799  : "r"(1LL),          // %4
2800    "r"(6LL)           // %5
2801  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2802  );
2803}
2804#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
2805
2806#ifdef __cplusplus
2807}  // extern "C"
2808}  // namespace libyuv
2809#endif
2810