1/*
2 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
18// This module is for GCC Neon armv8 64 bit.
19#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
// Read 8 Y, 4 U and 4 V from 422.
// Loads 8 bytes from operand %0 into v0.8b, 4 bytes from operand %1 into
// v1.s[0] and 4 bytes from operand %2 into v1.s[1], so v1.8b ends up holding
// 4 U bytes followed by 4 V bytes.  Each pointer operand is post-incremented
// by the number of bytes read.
#define READYUV422                                                             \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v1.s}[0], [%1], #4            \n"                             \
    MEMACCESS(2)                                                               \
    "ld1        {v1.s}[1], [%2], #4            \n"
29
// Read 8 Y, 2 U and 2 V from 411.
// Loads 8 bytes from operand %0 into v0.8b, 2 bytes from operand %1 into
// v2.h[0] and 2 bytes from operand %2 into v2.h[1].  The zip1 of v2 with
// itself duplicates every chroma byte, leaving v1.8b as 4 U bytes followed by
// 4 V bytes.  Clobbers v2.
#define READYUV411                                                             \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v2.h}[0], [%1], #2            \n"                             \
    MEMACCESS(2)                                                               \
    "ld1        {v2.h}[1], [%2], #2            \n"                             \
    "zip1       v1.8b, v2.8b, v2.8b            \n"
39
// Read 8 Y, 8 U and 8 V from 444.
// Loads 8 bytes from operand %0 into v0.8b, 8 bytes from operand %1 into
// v1.d[0] and 8 bytes from operand %2 into v1.d[1].  uaddlp sums each pair of
// adjacent bytes and rshrn halves the sums with rounding, so v1.8b ends up
// holding 4 averaged U bytes followed by 4 averaged V bytes.
#define READYUV444                                                             \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v1.d}[0], [%1], #8            \n"                             \
    MEMACCESS(2)                                                               \
    "ld1        {v1.d}[1], [%2], #8            \n"                             \
    "uaddlp     v1.8h, v1.16b                  \n"                             \
    "rshrn      v1.8b, v1.8h, #1               \n"
50
// Read 8 Y, and set 4 U and 4 V to 128.
// Loads 8 bytes from operand %0 into v0.8b (post-incrementing the pointer)
// and fills all 8 bytes of v1 with the constant 128; no chroma is read from
// memory.
#define READYUV400                                                             \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    "movi       v1.8b , #128                   \n"
56
// Read 8 Y and 4 UV from NV12.
// Loads 8 bytes from operand %0 into v0.8b and 8 interleaved chroma bytes
// from operand %1 into v2.  uzp1 gathers the even-indexed bytes into v1 and
// uzp2 the odd-indexed bytes into v3; the final ins leaves v1.8b as the 4
// even-indexed bytes (U) followed by the 4 odd-indexed bytes (V).
// Clobbers v2 and v3.
#define READNV12                                                               \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v2.8b}, [%1], #8              \n"                             \
    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
66
// Read 8 Y and 4 VU from NV21.
// Same as READNV12 except for the chroma order: the odd-indexed bytes of the
// interleaved plane (uzp2) go into the low half of v1 and the even-indexed
// bytes (uzp1, via v3) into the high half, so v1.8b again ends up as 4 U bytes
// followed by 4 V bytes.  Clobbers v2 and v3.
#define READNV21                                                               \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v2.8b}, [%1], #8              \n"                             \
    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
76
// Read 8 YUY2 pixels (16 bytes) from operand %0.
// ld2 de-interleaves the input: even-indexed bytes (Y) land in v0.8b and
// odd-indexed bytes (alternating chroma) in v1.8b.  uzp1/uzp2 then split the
// chroma so that v1.8b holds its 4 even-indexed bytes (U) followed by its 4
// odd-indexed bytes (V).  Clobbers v3.
#define READYUY2                                                               \
    MEMACCESS(0)                                                               \
    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
84
// Read 8 UYVY pixels (16 bytes) from operand %0.
// ld2 de-interleaves the input: even-indexed bytes (alternating chroma) land
// in v2.8b and odd-indexed bytes (Y) in v3.8b.  The orr copies Y into v0.8b
// before v3 is reused; uzp1/uzp2 then split the chroma so that v1.8b holds 4
// U bytes followed by 4 V bytes.  Clobbers v2 and v3.
#define READUYVY                                                               \
    MEMACCESS(0)                                                               \
    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
93
// Loads the loop-invariant constants used by YUV422TORGB:
//   v24.8h / v25.8h / v26.8h = B / G / R bias, broadcast from the first three
//                              16-bit lanes at [kUVBiasBGR]
//   v31.4s                   = Y scale, broadcast from the 32-bit value at
//                              [kYToRgb]
//   v27.8h..v30.8h           = chroma multipliers 128, 102, 25 and 52
//
// The asm operands named kUVBiasBGR and kYToRgb are passed as input-only "r"
// operands by the functions in this file, so this macro must not write them
// back.  The biases are therefore fetched with a single 16-byte load followed
// by lane broadcasts instead of post-incrementing ld1r loads.  This requires
// 16 readable bytes at [kUVBiasBGR], which the vec16 table in this file
// provides.
#define YUV422TORGB_SETUP_REG                                                  \
    "ld1        {v26.8h}, [%[kUVBiasBGR]]      \n" /* BB, BG, BR, ...      */  \
    "ld1r       {v31.4s}, [%[kYToRgb]]         \n" /* Y scale              */  \
    "dup        v24.8h, v26.h[0]               \n" /* B bias               */  \
    "dup        v25.8h, v26.h[1]               \n" /* G bias               */  \
    "dup        v26.8h, v26.h[2]               \n" /* R bias (in place)    */  \
    "movi       v27.8h, #128                   \n"                             \
    "movi       v28.8h, #102                   \n"                             \
    "movi       v29.8h, #25                    \n"                             \
    "movi       v30.8h, #52                    \n"
103
// Converts 8 pixels of YUV to 8-bit R, G and B.
//   Inputs:  v0.8b = 8 Y bytes; v1.8b = 4 U bytes followed by 4 V bytes (the
//            layout the READ* macros above produce); v24-v31 as loaded by
//            YUV422TORGB_SETUP_REG.
//   Outputs: vR, vG and vB name the vector registers that receive the 8 R, G
//            and B bytes, e.g. YUV422TORGB(v22, v21, v20).
//   Scratch: v0-v3 and v5-v7 are overwritten.
// Each chroma byte is duplicated so that one U/V sample covers two adjacent
// pixels.  All sums use saturating arithmetic and the final sqshrun drops the
// 6 fractional bits while clamping to 0..255.
#define YUV422TORGB(vR, vG, vB)                                                \
    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
    "ushll      v0.4s, v0.4h, #0               \n"                             \
    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
    "uxtl       v2.8h, v2.8b                   \n"                             \
    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
    "mul        v3.8h, v1.8h, v27.8h           \n" /* U term for B */          \
    "mul        v5.8h, v1.8h, v29.8h           \n" /* U term for G */          \
    "mul        v6.8h, v2.8h, v30.8h           \n" /* V term for G */          \
    "mul        v7.8h, v2.8h, v28.8h           \n" /* V term for R */          \
    "sqadd      v6.8h, v6.8h, v5.8h            \n" /* U + V terms for G */     \
    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
131
132// YUV to RGB conversion constants.
133// Y contribution to R,G,B.  Scale and bias.
134#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
135#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
136
137// U and V contributions to R,G,B.
138#define UB -128 /* -min(128, round(2.018 * 64)) */
139#define UG 25 /* -round(-0.391 * 64) */
140#define VG 52 /* -round(-0.813 * 64) */
141#define VR -102 /* -round(1.596 * 64) */
142
143// Bias values to subtract 16 from Y and 128 from U and V.
144#define BB (UB * 128            - YGB)
145#define BG (UG * 128 + VG * 128 - YGB)
146#define BR            (VR * 128 - YGB)
147
148static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
149static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
150
151#undef YG
152#undef YGB
153#undef UB
154#undef UG
155#undef VG
156#undef VR
157#undef BB
158#undef BG
159#undef BR
160
// Loads the constants for RGB to UV conversion into v20-v25: five 16-bit
// coefficient vectors (v20-v24) and a vector with every byte set to 0x80
// (v25).  The code that consumes these registers is not part of this section
// of the file.
#define RGBTOUV_SETUP_REG                                                      \
    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
168
169
170#ifdef HAS_I444TOARGBROW_NEON
171void I444ToARGBRow_NEON(const uint8* src_y,
172                        const uint8* src_u,
173                        const uint8* src_v,
174                        uint8* dst_argb,
175                        int width) {
176  asm volatile (
177    YUV422TORGB_SETUP_REG
178  "1:                                          \n"
179    READYUV444
180    YUV422TORGB(v22, v21, v20)
181    "subs       %w4, %w4, #8                 \n"
182    "movi       v23.8b, #255                   \n" /* A */
183    MEMACCESS(3)
184    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
185    "b.gt       1b                             \n"
186    : "+r"(src_y),     // %0
187      "+r"(src_u),     // %1
188      "+r"(src_v),     // %2
189      "+r"(dst_argb),  // %3
190      "+r"(width)      // %4
191    : [kUVBiasBGR]"r"(&kUVBiasBGR),
192      [kYToRgb]"r"(&kYToRgb)
193    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
194      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
195  );
196}
197#endif  // HAS_I444TOARGBROW_NEON
198
199#ifdef HAS_I422TOARGBROW_NEON
200void I422ToARGBRow_NEON(const uint8* src_y,
201                        const uint8* src_u,
202                        const uint8* src_v,
203                        uint8* dst_argb,
204                        int width) {
205  asm volatile (
206    YUV422TORGB_SETUP_REG
207  "1:                                          \n"
208    READYUV422
209    YUV422TORGB(v22, v21, v20)
210    "subs       %w4, %w4, #8                   \n"
211    "movi       v23.8b, #255                   \n" /* A */
212    MEMACCESS(3)
213    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
214    "b.gt       1b                             \n"
215    : "+r"(src_y),     // %0
216      "+r"(src_u),     // %1
217      "+r"(src_v),     // %2
218      "+r"(dst_argb),  // %3
219      "+r"(width)      // %4
220    : [kUVBiasBGR]"r"(&kUVBiasBGR),
221      [kYToRgb]"r"(&kYToRgb)
222    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
223      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
224  );
225}
226#endif  // HAS_I422TOARGBROW_NEON
227
228#ifdef HAS_I411TOARGBROW_NEON
229void I411ToARGBRow_NEON(const uint8* src_y,
230                        const uint8* src_u,
231                        const uint8* src_v,
232                        uint8* dst_argb,
233                        int width) {
234  asm volatile (
235    YUV422TORGB_SETUP_REG
236  "1:                                          \n"
237    READYUV411
238    YUV422TORGB(v22, v21, v20)
239    "subs       %w4, %w4, #8                   \n"
240    "movi       v23.8b, #255                   \n" /* A */
241    MEMACCESS(3)
242    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
243    "b.gt       1b                             \n"
244    : "+r"(src_y),     // %0
245      "+r"(src_u),     // %1
246      "+r"(src_v),     // %2
247      "+r"(dst_argb),  // %3
248      "+r"(width)      // %4
249    : [kUVBiasBGR]"r"(&kUVBiasBGR),
250      [kYToRgb]"r"(&kYToRgb)
251    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
252      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
253  );
254}
255#endif  // HAS_I411TOARGBROW_NEON
256
257#ifdef HAS_I422TOBGRAROW_NEON
258void I422ToBGRARow_NEON(const uint8* src_y,
259                        const uint8* src_u,
260                        const uint8* src_v,
261                        uint8* dst_bgra,
262                        int width) {
263  asm volatile (
264    YUV422TORGB_SETUP_REG
265  "1:                                          \n"
266    READYUV422
267    YUV422TORGB(v21, v22, v23)
268    "subs       %w4, %w4, #8                   \n"
269    "movi       v20.8b, #255                   \n" /* A */
270    MEMACCESS(3)
271    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
272    "b.gt       1b                             \n"
273    : "+r"(src_y),     // %0
274      "+r"(src_u),     // %1
275      "+r"(src_v),     // %2
276      "+r"(dst_bgra),  // %3
277      "+r"(width)      // %4
278    : [kUVBiasBGR]"r"(&kUVBiasBGR),
279      [kYToRgb]"r"(&kYToRgb)
280    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
281      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
282  );
283}
284#endif  // HAS_I422TOBGRAROW_NEON
285
286#ifdef HAS_I422TOABGRROW_NEON
287void I422ToABGRRow_NEON(const uint8* src_y,
288                        const uint8* src_u,
289                        const uint8* src_v,
290                        uint8* dst_abgr,
291                        int width) {
292  asm volatile (
293    YUV422TORGB_SETUP_REG
294  "1:                                          \n"
295    READYUV422
296    YUV422TORGB(v20, v21, v22)
297    "subs       %w4, %w4, #8                   \n"
298    "movi       v23.8b, #255                   \n" /* A */
299    MEMACCESS(3)
300    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
301    "b.gt       1b                             \n"
302    : "+r"(src_y),     // %0
303      "+r"(src_u),     // %1
304      "+r"(src_v),     // %2
305      "+r"(dst_abgr),  // %3
306      "+r"(width)      // %4
307    : [kUVBiasBGR]"r"(&kUVBiasBGR),
308      [kYToRgb]"r"(&kYToRgb)
309    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
310      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
311  );
312}
313#endif  // HAS_I422TOABGRROW_NEON
314
315#ifdef HAS_I422TORGBAROW_NEON
316void I422ToRGBARow_NEON(const uint8* src_y,
317                        const uint8* src_u,
318                        const uint8* src_v,
319                        uint8* dst_rgba,
320                        int width) {
321  asm volatile (
322    YUV422TORGB_SETUP_REG
323  "1:                                          \n"
324    READYUV422
325    YUV422TORGB(v23, v22, v21)
326    "subs       %w4, %w4, #8                   \n"
327    "movi       v20.8b, #255                   \n" /* A */
328    MEMACCESS(3)
329    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
330    "b.gt       1b                             \n"
331    : "+r"(src_y),     // %0
332      "+r"(src_u),     // %1
333      "+r"(src_v),     // %2
334      "+r"(dst_rgba),  // %3
335      "+r"(width)      // %4
336    : [kUVBiasBGR]"r"(&kUVBiasBGR),
337      [kYToRgb]"r"(&kYToRgb)
338    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
339      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
340  );
341}
342#endif  // HAS_I422TORGBAROW_NEON
343
344#ifdef HAS_I422TORGB24ROW_NEON
345void I422ToRGB24Row_NEON(const uint8* src_y,
346                         const uint8* src_u,
347                         const uint8* src_v,
348                         uint8* dst_rgb24,
349                         int width) {
350  asm volatile (
351    YUV422TORGB_SETUP_REG
352  "1:                                          \n"
353    READYUV422
354    YUV422TORGB(v22, v21, v20)
355    "subs       %w4, %w4, #8                   \n"
356    MEMACCESS(3)
357    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
358    "b.gt       1b                             \n"
359    : "+r"(src_y),     // %0
360      "+r"(src_u),     // %1
361      "+r"(src_v),     // %2
362      "+r"(dst_rgb24), // %3
363      "+r"(width)      // %4
364    : [kUVBiasBGR]"r"(&kUVBiasBGR),
365      [kYToRgb]"r"(&kYToRgb)
366    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
367      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
368  );
369}
370#endif  // HAS_I422TORGB24ROW_NEON
371
372#ifdef HAS_I422TORAWROW_NEON
373void I422ToRAWRow_NEON(const uint8* src_y,
374                       const uint8* src_u,
375                       const uint8* src_v,
376                       uint8* dst_raw,
377                       int width) {
378  asm volatile (
379    YUV422TORGB_SETUP_REG
380  "1:                                          \n"
381    READYUV422
382    YUV422TORGB(v20, v21, v22)
383    "subs       %w4, %w4, #8                   \n"
384    MEMACCESS(3)
385    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
386    "b.gt       1b                             \n"
387    : "+r"(src_y),     // %0
388      "+r"(src_u),     // %1
389      "+r"(src_v),     // %2
390      "+r"(dst_raw),   // %3
391      "+r"(width)      // %4
392    : [kUVBiasBGR]"r"(&kUVBiasBGR),
393      [kYToRgb]"r"(&kYToRgb)
394    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
395      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
396  );
397}
398#endif  // HAS_I422TORAWROW_NEON
399
// Packs 8 pixels into RGB565.
//   Inputs:  v20.8b = B, v21.8b = G, v22.8b = R.
//   Output:  v0.8h, one 16-bit pixel per lane with R in bits 15..11, G in
//            bits 10..5 and B in bits 4..0.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8);
// the sri (shift right and insert) steps then slot G and B in below the
// bits already kept.  v20 and v21 are overwritten with the widened values.
#define ARGBTORGB565                                                           \
    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
406
407#ifdef HAS_I422TORGB565ROW_NEON
408void I422ToRGB565Row_NEON(const uint8* src_y,
409                          const uint8* src_u,
410                          const uint8* src_v,
411                          uint8* dst_rgb565,
412                          int width) {
413  asm volatile (
414    YUV422TORGB_SETUP_REG
415  "1:                                          \n"
416    READYUV422
417    YUV422TORGB(v22, v21, v20)
418    "subs       %w4, %w4, #8                   \n"
419    ARGBTORGB565
420    MEMACCESS(3)
421    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
422    "b.gt       1b                             \n"
423    : "+r"(src_y),    // %0
424      "+r"(src_u),    // %1
425      "+r"(src_v),    // %2
426      "+r"(dst_rgb565),  // %3
427      "+r"(width)     // %4
428    : [kUVBiasBGR]"r"(&kUVBiasBGR),
429      [kYToRgb]"r"(&kYToRgb)
430    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
431      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
432  );
433}
434#endif  // HAS_I422TORGB565ROW_NEON
435
// Packs 8 pixels into ARGB1555.
//   Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
//   Output:  v0.8h, one 16-bit pixel per lane with A in bit 15, R in bits
//            14..10, G in bits 9..5 and B in bits 4..0.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8);
// the sri (shift right and insert) steps then slot R, G and B in below the
// bits already kept.  v20, v21 and v22 are overwritten with the widened
// values.
#define ARGBTOARGB1555                                                         \
    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
444
445#ifdef HAS_I422TOARGB1555ROW_NEON
446void I422ToARGB1555Row_NEON(const uint8* src_y,
447                            const uint8* src_u,
448                            const uint8* src_v,
449                            uint8* dst_argb1555,
450                            int width) {
451  asm volatile (
452    YUV422TORGB_SETUP_REG
453  "1:                                          \n"
454    READYUV422
455    YUV422TORGB(v22, v21, v20)
456    "subs       %w4, %w4, #8                   \n"
457    "movi       v23.8b, #255                   \n"
458    ARGBTOARGB1555
459    MEMACCESS(3)
460    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
461    "b.gt       1b                             \n"
462    : "+r"(src_y),    // %0
463      "+r"(src_u),    // %1
464      "+r"(src_v),    // %2
465      "+r"(dst_argb1555),  // %3
466      "+r"(width)     // %4
467    : [kUVBiasBGR]"r"(&kUVBiasBGR),
468      [kYToRgb]"r"(&kYToRgb)
469    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
470      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
471  );
472}
473#endif  // HAS_I422TOARGB1555ROW_NEON
474
// Packs 8 pixels into ARGB4444 (16 bytes in v0.16b).
// B and R are shifted down into the low nibble; G and A keep only their high
// nibble (bic with the 0x0f mask in v4).  The orr steps build one byte of
// G:B and one byte of A:R per pixel, and zip1 interleaves those bytes so each
// pixel occupies two consecutive bytes.  v20-v23 and v1 are overwritten.
#define ARGBTOARGB4444                                                         \
    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
484
485#ifdef HAS_I422TOARGB4444ROW_NEON
486void I422ToARGB4444Row_NEON(const uint8* src_y,
487                            const uint8* src_u,
488                            const uint8* src_v,
489                            uint8* dst_argb4444,
490                            int width) {
491  asm volatile (
492    YUV422TORGB_SETUP_REG
493    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
494  "1:                                          \n"
495    READYUV422
496    YUV422TORGB(v22, v21, v20)
497    "subs       %w4, %w4, #8                   \n"
498    "movi       v23.8b, #255                   \n"
499    ARGBTOARGB4444
500    MEMACCESS(3)
501    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
502    "b.gt       1b                             \n"
503    : "+r"(src_y),    // %0
504      "+r"(src_u),    // %1
505      "+r"(src_v),    // %2
506      "+r"(dst_argb4444),  // %3
507      "+r"(width)     // %4
508    : [kUVBiasBGR]"r"(&kUVBiasBGR),
509      [kYToRgb]"r"(&kYToRgb)
510    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
511      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
512  );
513}
514#endif  // HAS_I422TOARGB4444ROW_NEON
515
516#ifdef HAS_I400TOARGBROW_NEON
517void I400ToARGBRow_NEON(const uint8* src_y,
518                        uint8* dst_argb,
519                        int width) {
520  int64 width64 = (int64)(width);
521  asm volatile (
522    YUV422TORGB_SETUP_REG
523  "1:                                          \n"
524    READYUV400
525    YUV422TORGB(v22, v21, v20)
526    "subs       %w2, %w2, #8                   \n"
527    "movi       v23.8b, #255                   \n"
528    MEMACCESS(1)
529    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
530    "b.gt       1b                             \n"
531    : "+r"(src_y),     // %0
532      "+r"(dst_argb),  // %1
533      "+r"(width64)    // %2
534    : [kUVBiasBGR]"r"(&kUVBiasBGR),
535      [kYToRgb]"r"(&kYToRgb)
536    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
537      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
538  );
539}
540#endif  // HAS_I400TOARGBROW_NEON
541
542#ifdef HAS_J400TOARGBROW_NEON
// Expands one row of 8-bit Y to 32-bit pixels by copying each Y byte,
// unmodified, into three colour channels and setting the fourth byte to 255.
// Each iteration reads 8 bytes from src_y and writes 32 bytes to dst_argb.
// width is decremented by 8 per iteration and the loop body always runs at
// least once, so it is presumably expected to be a positive multiple of 8 --
// confirm against callers.
void J400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "movi       v23.8b, #255                   \n"  // alpha, set once
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v20.8b}, [%0], #8             \n"  // load 8 Y
    "orr        v21.8b, v20.8b, v20.8b         \n"  // copy Y into v21
    "orr        v22.8b, v20.8b, v20.8b         \n"  // copy Y into v22
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    MEMACCESS(1)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "v20", "v21", "v22", "v23"
  );
}
564#endif  // HAS_J400TOARGBROW_NEON
565
566#ifdef HAS_NV12TOARGBROW_NEON
567void NV12ToARGBRow_NEON(const uint8* src_y,
568                        const uint8* src_uv,
569                        uint8* dst_argb,
570                        int width) {
571  asm volatile (
572    YUV422TORGB_SETUP_REG
573  "1:                                          \n"
574    READNV12
575    YUV422TORGB(v22, v21, v20)
576    "subs       %w3, %w3, #8                   \n"
577    "movi       v23.8b, #255                   \n"
578    MEMACCESS(2)
579    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
580    "b.gt       1b                             \n"
581    : "+r"(src_y),     // %0
582      "+r"(src_uv),    // %1
583      "+r"(dst_argb),  // %2
584      "+r"(width)      // %3
585    : [kUVBiasBGR]"r"(&kUVBiasBGR),
586      [kYToRgb]"r"(&kYToRgb)
587    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
588      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
589  );
590}
591#endif  // HAS_NV12TOARGBROW_NEON
592
593#ifdef HAS_NV21TOARGBROW_NEON
594void NV21ToARGBRow_NEON(const uint8* src_y,
595                        const uint8* src_uv,
596                        uint8* dst_argb,
597                        int width) {
598  asm volatile (
599    YUV422TORGB_SETUP_REG
600  "1:                                          \n"
601    READNV21
602    YUV422TORGB(v22, v21, v20)
603    "subs       %w3, %w3, #8                   \n"
604    "movi       v23.8b, #255                   \n"
605    MEMACCESS(2)
606    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
607    "b.gt       1b                             \n"
608    : "+r"(src_y),     // %0
609      "+r"(src_uv),    // %1
610      "+r"(dst_argb),  // %2
611      "+r"(width)      // %3
612    : [kUVBiasBGR]"r"(&kUVBiasBGR),
613      [kYToRgb]"r"(&kYToRgb)
614    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
615      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
616  );
617}
618#endif  // HAS_NV21TOARGBROW_NEON
619
620#ifdef HAS_NV12TORGB565ROW_NEON
621void NV12ToRGB565Row_NEON(const uint8* src_y,
622                          const uint8* src_uv,
623                          uint8* dst_rgb565,
624                          int width) {
625  asm volatile (
626    YUV422TORGB_SETUP_REG
627  "1:                                          \n"
628    READNV12
629    YUV422TORGB(v22, v21, v20)
630    "subs       %w3, %w3, #8                   \n"
631    ARGBTORGB565
632    MEMACCESS(2)
633    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
634    "b.gt       1b                             \n"
635    : "+r"(src_y),     // %0
636      "+r"(src_uv),    // %1
637      "+r"(dst_rgb565),  // %2
638      "+r"(width)      // %3
639    : [kUVBiasBGR]"r"(&kUVBiasBGR),
640      [kYToRgb]"r"(&kYToRgb)
641    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
642      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
643  );
644}
645#endif  // HAS_NV12TORGB565ROW_NEON
646
647#ifdef HAS_NV21TORGB565ROW_NEON
648void NV21ToRGB565Row_NEON(const uint8* src_y,
649                          const uint8* src_uv,
650                          uint8* dst_rgb565,
651                          int width) {
652  asm volatile (
653    YUV422TORGB_SETUP_REG
654  "1:                                          \n"
655    READNV21
656    YUV422TORGB(v22, v21, v20)
657    "subs       %w3, %w3, #8                   \n"
658    ARGBTORGB565
659    MEMACCESS(2)
660    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
661    "b.gt       1b                             \n"
662    : "+r"(src_y),     // %0
663      "+r"(src_uv),    // %1
664      "+r"(dst_rgb565),  // %2
665      "+r"(width)      // %3
666    : [kUVBiasBGR]"r"(&kUVBiasBGR),
667      [kYToRgb]"r"(&kYToRgb)
668    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
669      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
670  );
671}
672#endif  // HAS_NV21TORGB565ROW_NEON
673
674#ifdef HAS_YUY2TOARGBROW_NEON
675void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
676                        uint8* dst_argb,
677                        int width) {
678  int64 width64 = (int64)(width);
679  asm volatile (
680    YUV422TORGB_SETUP_REG
681  "1:                                          \n"
682    READYUY2
683    YUV422TORGB(v22, v21, v20)
684    "subs       %w2, %w2, #8                   \n"
685    "movi       v23.8b, #255                   \n"
686    MEMACCESS(1)
687    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
688    "b.gt       1b                             \n"
689    : "+r"(src_yuy2),  // %0
690      "+r"(dst_argb),  // %1
691      "+r"(width64)    // %2
692    : [kUVBiasBGR]"r"(&kUVBiasBGR),
693      [kYToRgb]"r"(&kYToRgb)
694    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
695      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
696  );
697}
698#endif  // HAS_YUY2TOARGBROW_NEON
699
700#ifdef HAS_UYVYTOARGBROW_NEON
701void UYVYToARGBRow_NEON(const uint8* src_uyvy,
702                        uint8* dst_argb,
703                        int width) {
704  int64 width64 = (int64)(width);
705  asm volatile (
706    YUV422TORGB_SETUP_REG
707  "1:                                          \n"
708    READUYVY
709    YUV422TORGB(v22, v21, v20)
710    "subs       %w2, %w2, #8                   \n"
711    "movi       v23.8b, #255                   \n"
712    MEMACCESS(1)
713    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
714    "b.gt       1b                             \n"
715    : "+r"(src_uyvy),  // %0
716      "+r"(dst_argb),  // %1
717      "+r"(width64)    // %2
718    : [kUVBiasBGR]"r"(&kUVBiasBGR),
719      [kYToRgb]"r"(&kYToRgb)
720    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
721      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
722  );
723}
724#endif  // HAS_UYVYTOARGBROW_NEON
725
// Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
727#ifdef HAS_SPLITUVROW_NEON
728void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
729                     int width) {
730  asm volatile (
731  "1:                                          \n"
732    MEMACCESS(0)
733    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
734    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
735    MEMACCESS(1)
736    "st1        {v0.16b}, [%1], #16            \n"  // store U
737    MEMACCESS(2)
738    "st1        {v1.16b}, [%2], #16            \n"  // store V
739    "b.gt       1b                             \n"
740    : "+r"(src_uv),  // %0
741      "+r"(dst_u),   // %1
742      "+r"(dst_v),   // %2
743      "+r"(width)    // %3  // Output registers
744    :                       // Input registers
745    : "cc", "memory", "v0", "v1"  // Clobber List
746  );
747}
748#endif  // HAS_SPLITUVROW_NEON
749
750// Reads 16 U's and V's and writes out 16 pairs of UV.
751#ifdef HAS_MERGEUVROW_NEON
752void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
753                     int width) {
754  asm volatile (
755  "1:                                          \n"
756    MEMACCESS(0)
757    "ld1        {v0.16b}, [%0], #16            \n"  // load U
758    MEMACCESS(1)
759    "ld1        {v1.16b}, [%1], #16            \n"  // load V
760    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
761    MEMACCESS(2)
762    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
763    "b.gt       1b                             \n"
764    :
765      "+r"(src_u),   // %0
766      "+r"(src_v),   // %1
767      "+r"(dst_uv),  // %2
768      "+r"(width)    // %3  // Output registers
769    :                       // Input registers
770    : "cc", "memory", "v0", "v1"  // Clobber List
771  );
772}
773#endif  // HAS_MERGEUVROW_NEON
774
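// Copies a multiple of 32 bytes per call using ld1/st1 on four 8-byte
// registers. NOTE(review): the earlier "vld4.8 ... fastest on a15" remark
// looks inherited from a 32-bit ARM version of this routine; confirm.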
776#ifdef HAS_COPYROW_NEON
777void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
778  asm volatile (
779  "1:                                          \n"
780    MEMACCESS(0)
781    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
782    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
783    MEMACCESS(1)
784    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
785    "b.gt       1b                             \n"
786  : "+r"(src),   // %0
787    "+r"(dst),   // %1
788    "+r"(count)  // %2  // Output registers
789  :                     // Input registers
790  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
791  );
792}
793#endif  // HAS_COPYROW_NEON
794
795// SetRow writes 'count' bytes using an 8 bit value repeated.
796void SetRow_NEON(uint8* dst, uint8 v8, int count) {
797  asm volatile (
798    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
799  "1:                                          \n"
800    "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop
801    MEMACCESS(0)
802    "st1        {v0.16b}, [%0], #16            \n"  // store
803    "b.gt      1b                              \n"
804  : "+r"(dst),   // %0
805    "+r"(count)  // %1
806  : "r"(v8)      // %2
807  : "cc", "memory", "v0"
808  );
809}
810
811void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
812  asm volatile (
813    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
814  "1:                                          \n"
815    "subs      %w1, %w1, #4                    \n"  // 4 ints per loop
816    MEMACCESS(0)
817    "st1        {v0.16b}, [%0], #16            \n"  // store
818    "b.gt      1b                              \n"
819  : "+r"(dst),   // %0
820    "+r"(count)  // %1
821  : "r"(v32)     // %2
822  : "cc", "memory", "v0"
823  );
824}
825
826#ifdef HAS_MIRRORROW_NEON
827void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
828  int64 width64 = (int64) width;
829  asm volatile (
830    // Start at end of source row.
831    "add        %0, %0, %2                     \n"
832    "sub        %0, %0, #16                    \n"
833
834  "1:                                          \n"
835    MEMACCESS(0)
836    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
837    "subs       %2, %2, #16                   \n"  // 16 pixels per loop.
838    "rev64      v0.16b, v0.16b                 \n"
839    MEMACCESS(1)
840    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
841    MEMACCESS(1)
842    "st1        {v0.D}[0], [%1], #8            \n"
843    "b.gt       1b                             \n"
844  : "+r"(src),   // %0
845    "+r"(dst),   // %1
846    "+r"(width64)  // %2
847  : "r"((ptrdiff_t)-16)    // %3
848  : "cc", "memory", "v0"
849  );
850}
851#endif  // HAS_MIRRORROW_NEON
852
853#ifdef HAS_MIRRORUVROW_NEON
854void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
855                      int width) {
856  int64 width64 = (int64) width;
857  asm volatile (
858    // Start at end of source row.
859    "add        %0, %0, %3, lsl #1             \n"
860    "sub        %0, %0, #16                    \n"
861
862  "1:                                          \n"
863    MEMACCESS(0)
864    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
865    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
866    "rev64      v0.8b, v0.8b                   \n"
867    "rev64      v1.8b, v1.8b                   \n"
868    MEMACCESS(1)
869    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
870    MEMACCESS(2)
871    "st1        {v1.8b}, [%2], #8              \n"
872    "b.gt       1b                             \n"
873  : "+r"(src_uv),  // %0
874    "+r"(dst_u),   // %1
875    "+r"(dst_v),   // %2
876    "+r"(width64)    // %3
877  : "r"((ptrdiff_t)-16)      // %4
878  : "cc", "memory", "v0", "v1"
879  );
880}
881#endif  // HAS_MIRRORUVROW_NEON
882
883#ifdef HAS_ARGBMIRRORROW_NEON
884void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
885  int64 width64 = (int64) width;
886  asm volatile (
887    // Start at end of source row.
888    "add        %0, %0, %2, lsl #2             \n"
889    "sub        %0, %0, #16                    \n"
890
891  "1:                                          \n"
892    MEMACCESS(0)
893    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
894    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
895    "rev64      v0.4s, v0.4s                   \n"
896    MEMACCESS(1)
897    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
898    MEMACCESS(1)
899    "st1        {v0.D}[0], [%1], #8            \n"
900    "b.gt       1b                             \n"
901  : "+r"(src),   // %0
902    "+r"(dst),   // %1
903    "+r"(width64)  // %2
904  : "r"((ptrdiff_t)-16)    // %3
905  : "cc", "memory", "v0"
906  );
907}
908#endif  // HAS_ARGBMIRRORROW_NEON
909
910#ifdef HAS_RGB24TOARGBROW_NEON
911void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
912  asm volatile (
913    "movi       v4.8b, #255                    \n"  // Alpha
914  "1:                                          \n"
915    MEMACCESS(0)
916    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
917    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
918    MEMACCESS(1)
919    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
920    "b.gt       1b                             \n"
921  : "+r"(src_rgb24),  // %0
922    "+r"(dst_argb),   // %1
923    "+r"(pix)         // %2
924  :
925  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
926  );
927}
928#endif  // HAS_RGB24TOARGBROW_NEON
929
930#ifdef HAS_RAWTOARGBROW_NEON
931void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
932  asm volatile (
933    "movi       v5.8b, #255                    \n"  // Alpha
934  "1:                                          \n"
935    MEMACCESS(0)
936    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
937    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
938    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
939    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
940    MEMACCESS(1)
941    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
942    "b.gt       1b                             \n"
943  : "+r"(src_raw),   // %0
944    "+r"(dst_argb),  // %1
945    "+r"(pix)        // %2
946  :
947  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
948  );
949}
950#endif  // HAS_RAWTOARGBROW_NEON
951
// Assembly fragment: expands 8 RGB565 pixels held in v0 (eight 16-bit lanes)
// to 8-bit channels. On exit the low 8 bytes of v0 hold B, v1 holds G and
// v2 holds R. Each channel's top bits are copied into its low bits so the
// value fills all 8 bits. v4 and v6 are used as scratch; v3 is not touched.
#define RGB565TOARGB                                                           \
    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
964
965#ifdef HAS_RGB565TOARGBROW_NEON
966void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
967  asm volatile (
968    "movi       v3.8b, #255                    \n"  // Alpha
969  "1:                                          \n"
970    MEMACCESS(0)
971    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
972    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
973    RGB565TOARGB
974    MEMACCESS(1)
975    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
976    "b.gt       1b                             \n"
977  : "+r"(src_rgb565),  // %0
978    "+r"(dst_argb),    // %1
979    "+r"(pix)          // %2
980  :
981  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
982  );
983}
984#endif  // HAS_RGB565TOARGBROW_NEON
985
// Assembly fragment: expands 8 ARGB1555 pixels held in v0 (eight 16-bit
// lanes) to 8-bit channels. On exit the low 8 bytes of v0 hold B, v1 holds G,
// v2 holds R and v3 holds A. The single alpha bit is widened with an
// arithmetic shift, so A becomes all-zeros or all-ones. v3 is fully
// overwritten before it is read, so any value placed in v3 beforehand is lost.
#define ARGB1555TOARGB                                                         \
    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
                                                                               \
    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
    "xtn2       v3.16b, v2.8h                  \n"                             \
                                                                               \
    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
                                                                               \
    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
                                                                               \
    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
    "dup        v1.2D, v0.D[1]                 \n"                             \
    "dup        v3.2D, v2.D[1]                 \n"
1005
// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
// Assembly fragment: expands 8 pixels held in v0 (eight 16-bit lanes) to
// 8-bit channels. On exit the low 8 bytes of v0 hold B, v1 holds G and v2
// holds R. v3 is used as scratch (it receives the RRRRR000 intermediate), so
// a caller that wants a constant in v3 must set it after this fragment.
// The final line deliberately has no trailing backslash so the macro ends
// here and cannot absorb whatever line follows it.
#define RGB555TOARGB                                                           \
    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
                                                                               \
    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
                                                                               \
    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
                                                                               \
    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
    "dup        v1.2D, v0.D[1]                 \n"  /* G */
1022
1023#ifdef HAS_ARGB1555TOARGBROW_NEON
1024void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
1025                            int pix) {
1026  asm volatile (
1027    "movi       v3.8b, #255                    \n"  // Alpha
1028  "1:                                          \n"
1029    MEMACCESS(0)
1030    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1031    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1032    ARGB1555TOARGB
1033    MEMACCESS(1)
1034    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
1035    "b.gt       1b                             \n"
1036  : "+r"(src_argb1555),  // %0
1037    "+r"(dst_argb),    // %1
1038    "+r"(pix)          // %2
1039  :
1040  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1041  );
1042}
1043#endif  // HAS_ARGB1555TOARGBROW_NEON
1044
// Assembly fragment: expands 8 ARGB4444 pixels held in v0 (eight 16-bit
// lanes) to 8-bit channels. On exit the low 8 bytes of v0 hold B, v1 holds G,
// v2 holds R and v3 holds A. Each 4-bit value is placed in both the high and
// the low nibble of its output byte. Only v0-v3 are written.
#define ARGB4444TOARGB                                                         \
    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
    "dup        v0.2D, v2.D[1]                 \n"                             \
    "dup        v1.2D, v3.D[1]                 \n"
1056
1057#ifdef HAS_ARGB4444TOARGBROW_NEON
1058void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
1059                            int pix) {
1060  asm volatile (
1061  "1:                                          \n"
1062    MEMACCESS(0)
1063    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
1064    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1065    ARGB4444TOARGB
1066    MEMACCESS(1)
1067    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
1068    "b.gt       1b                             \n"
1069  : "+r"(src_argb4444),  // %0
1070    "+r"(dst_argb),    // %1
1071    "+r"(pix)          // %2
1072  :
1073  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
1074  );
1075}
1076#endif  // HAS_ARGB4444TOARGBROW_NEON
1077
1078#ifdef HAS_ARGBTORGB24ROW_NEON
1079void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
1080  asm volatile (
1081  "1:                                          \n"
1082    MEMACCESS(0)
1083    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
1084    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1085    MEMACCESS(1)
1086    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
1087    "b.gt       1b                             \n"
1088  : "+r"(src_argb),   // %0
1089    "+r"(dst_rgb24),  // %1
1090    "+r"(pix)         // %2
1091  :
1092  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
1093  );
1094}
1095#endif  // HAS_ARGBTORGB24ROW_NEON
1096
1097#ifdef HAS_ARGBTORAWROW_NEON
1098void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
1099  asm volatile (
1100  "1:                                          \n"
1101    MEMACCESS(0)
1102    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
1103    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1104    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
1105    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
1106    MEMACCESS(1)
1107    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
1108    "b.gt       1b                             \n"
1109  : "+r"(src_argb),  // %0
1110    "+r"(dst_raw),   // %1
1111    "+r"(pix)        // %2
1112  :
1113  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
1114  );
1115}
1116#endif  // HAS_ARGBTORAWROW_NEON
1117
1118#ifdef HAS_YUY2TOYROW_NEON
1119void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
1120  asm volatile (
1121  "1:                                          \n"
1122    MEMACCESS(0)
1123    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
1124    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
1125    MEMACCESS(1)
1126    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1127    "b.gt       1b                             \n"
1128  : "+r"(src_yuy2),  // %0
1129    "+r"(dst_y),     // %1
1130    "+r"(pix)        // %2
1131  :
1132  : "cc", "memory", "v0", "v1"  // Clobber List
1133  );
1134}
1135#endif  // HAS_YUY2TOYROW_NEON
1136
1137#ifdef HAS_UYVYTOYROW_NEON
1138void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
1139  asm volatile (
1140  "1:                                          \n"
1141    MEMACCESS(0)
1142    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
1143    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
1144    MEMACCESS(1)
1145    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1146    "b.gt       1b                             \n"
1147  : "+r"(src_uyvy),  // %0
1148    "+r"(dst_y),     // %1
1149    "+r"(pix)        // %2
1150  :
1151  : "cc", "memory", "v0", "v1"  // Clobber List
1152  );
1153}
1154#endif  // HAS_UYVYTOYROW_NEON
1155
1156#ifdef HAS_YUY2TOUV422ROW_NEON
1157void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1158                         int pix) {
1159  asm volatile (
1160  "1:                                          \n"
1161    MEMACCESS(0)
1162    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
1163    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1164    MEMACCESS(1)
1165    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
1166    MEMACCESS(2)
1167    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
1168    "b.gt       1b                             \n"
1169  : "+r"(src_yuy2),  // %0
1170    "+r"(dst_u),     // %1
1171    "+r"(dst_v),     // %2
1172    "+r"(pix)        // %3
1173  :
1174  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1175  );
1176}
1177#endif  // HAS_YUY2TOUV422ROW_NEON
1178
1179#ifdef HAS_UYVYTOUV422ROW_NEON
1180void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1181                         int pix) {
1182  asm volatile (
1183  "1:                                          \n"
1184    MEMACCESS(0)
1185    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
1186    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1187    MEMACCESS(1)
1188    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
1189    MEMACCESS(2)
1190    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
1191    "b.gt       1b                             \n"
1192  : "+r"(src_uyvy),  // %0
1193    "+r"(dst_u),     // %1
1194    "+r"(dst_v),     // %2
1195    "+r"(pix)        // %3
1196  :
1197  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1198  );
1199}
1200#endif  // HAS_UYVYTOUV422ROW_NEON
1201
1202#ifdef HAS_YUY2TOUVROW_NEON
1203void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1204                      uint8* dst_u, uint8* dst_v, int pix) {
1205  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1206  asm volatile (
1207  "1:                                          \n"
1208    MEMACCESS(0)
1209    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1210    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1211    MEMACCESS(1)
1212    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1213    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
1214    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
1215    MEMACCESS(2)
1216    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
1217    MEMACCESS(3)
1218    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
1219    "b.gt       1b                             \n"
1220  : "+r"(src_yuy2),     // %0
1221    "+r"(src_yuy2b),    // %1
1222    "+r"(dst_u),        // %2
1223    "+r"(dst_v),        // %3
1224    "+r"(pix)           // %4
1225  :
1226  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1227    "v5", "v6", "v7"  // Clobber List
1228  );
1229}
1230#endif  // HAS_YUY2TOUVROW_NEON
1231
1232#ifdef HAS_UYVYTOUVROW_NEON
1233void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1234                      uint8* dst_u, uint8* dst_v, int pix) {
1235  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1236  asm volatile (
1237  "1:                                          \n"
1238    MEMACCESS(0)
1239    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1240    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1241    MEMACCESS(1)
1242    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1243    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
1244    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
1245    MEMACCESS(2)
1246    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
1247    MEMACCESS(3)
1248    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
1249    "b.gt       1b                             \n"
1250  : "+r"(src_uyvy),     // %0
1251    "+r"(src_uyvyb),    // %1
1252    "+r"(dst_u),        // %2
1253    "+r"(dst_v),        // %3
1254    "+r"(pix)           // %4
1255  :
1256  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1257    "v5", "v6", "v7"  // Clobber List
1258  );
1259}
1260#endif  // HAS_UYVYTOUVROW_NEON
1261
1262// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1263#ifdef HAS_ARGBSHUFFLEROW_NEON
1264void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1265                         const uint8* shuffler, int pix) {
1266  asm volatile (
1267    MEMACCESS(3)
1268    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
1269  "1:                                          \n"
1270    MEMACCESS(0)
1271    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
1272    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
1273    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
1274    MEMACCESS(1)
1275    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
1276    "b.gt       1b                             \n"
1277  : "+r"(src_argb),  // %0
1278    "+r"(dst_argb),  // %1
1279    "+r"(pix)        // %2
1280  : "r"(shuffler)    // %3
1281  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1282  );
1283}
1284#endif  // HAS_ARGBSHUFFLEROW_NEON
1285
1286#ifdef HAS_I422TOYUY2ROW_NEON
1287void I422ToYUY2Row_NEON(const uint8* src_y,
1288                        const uint8* src_u,
1289                        const uint8* src_v,
1290                        uint8* dst_yuy2, int width) {
1291  asm volatile (
1292  "1:                                          \n"
1293    MEMACCESS(0)
1294    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
1295    "orr        v2.8b, v1.8b, v1.8b            \n"
1296    MEMACCESS(1)
1297    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
1298    MEMACCESS(2)
1299    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
1300    "subs       %w4, %w4, #16                  \n"  // 16 pixels
1301    MEMACCESS(3)
1302    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1303    "b.gt       1b                             \n"
1304  : "+r"(src_y),     // %0
1305    "+r"(src_u),     // %1
1306    "+r"(src_v),     // %2
1307    "+r"(dst_yuy2),  // %3
1308    "+r"(width)      // %4
1309  :
1310  : "cc", "memory", "v0", "v1", "v2", "v3"
1311  );
1312}
1313#endif  // HAS_I422TOYUY2ROW_NEON
1314
1315#ifdef HAS_I422TOUYVYROW_NEON
1316void I422ToUYVYRow_NEON(const uint8* src_y,
1317                        const uint8* src_u,
1318                        const uint8* src_v,
1319                        uint8* dst_uyvy, int width) {
1320  asm volatile (
1321  "1:                                          \n"
1322    MEMACCESS(0)
1323    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
1324    "orr        v3.8b, v2.8b, v2.8b            \n"
1325    MEMACCESS(1)
1326    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
1327    MEMACCESS(2)
1328    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
1329    "subs       %w4, %w4, #16                  \n"  // 16 pixels
1330    MEMACCESS(3)
1331    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1332    "b.gt       1b                             \n"
1333  : "+r"(src_y),     // %0
1334    "+r"(src_u),     // %1
1335    "+r"(src_v),     // %2
1336    "+r"(dst_uyvy),  // %3
1337    "+r"(width)      // %4
1338  :
1339  : "cc", "memory", "v0", "v1", "v2", "v3"
1340  );
1341}
1342#endif  // HAS_I422TOUYVYROW_NEON
1343
1344#ifdef HAS_ARGBTORGB565ROW_NEON
// Converts 4-byte ARGB pixels to 2-byte RGB565 pixels, 8 pixels per pass.
// The packing itself is done by the ARGBTORGB565 macro (defined outside this
// section); from its use here it reads the channel planes in v20-v23 and
// leaves the 8 packed pixels in v0 — confirm against the macro.
// The loop runs at least once and repeats while the remaining count is
// positive, so pix is effectively rounded up to a multiple of 8.
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGBTORGB565
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgb565),  // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
  );
}
1362#endif  // HAS_ARGBTORGB565ROW_NEON
1363
#ifdef HAS_ARGBTORGB565DITHERROW_NEON
// Converts 4-byte ARGB pixels to 2-byte RGB565 pixels after adding a dither
// pattern, 8 pixels per pass.
//   src_argb: source row; 32 bytes are read per pass.
//   dst_rgb:  destination row; 16 bytes are written per pass.
//   dither4:  four dither bytes packed in a 32-bit value. It is broadcast
//             across v1, so its low 8 bytes hold the 4-byte pattern twice.
//             That pattern is added with unsigned saturation to each of the
//             first three channel planes, one byte per pixel.
//   width:    number of pixels. The loop runs at least once and repeats while
//             the remaining count is positive, so width is effectively
//             rounded up to a multiple of 8.
// The packing itself is done by the ARGBTORGB565 macro (defined outside this
// section); from its use here it leaves the 8 packed pixels in v0.
//
// src_argb and width are declared as read-write ("+r") operands because the
// asm modifies both (post-incrementing load and subs). Declaring a modified
// register as input-only lets the compiler assume it still holds its original
// value afterwards. Operand numbering matches ARGBToRGB565Row_NEON.
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  asm volatile (
    "dup        v1.4s, %w3                     \n"  // dither4
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uqadd      v20.8b, v20.8b, v1.8b          \n"  // dither first channel
    "uqadd      v21.8b, v21.8b, v1.8b          \n"  // dither second channel
    "uqadd      v22.8b, v22.8b, v1.8b          \n"  // dither third channel
    ARGBTORGB565
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgb),   // %1
    "+r"(width)      // %2
  : "r"(dither4)     // %3
  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
  );
}
#endif  // HAS_ARGBTORGB565DITHERROW_NEON
1388
1389#ifdef HAS_ARGBTOARGB1555ROW_NEON
// Converts 4-byte ARGB pixels to 2-byte ARGB1555 pixels, 8 pixels per pass.
// The packing itself is done by the ARGBTOARGB1555 macro (defined outside
// this section); from its use here it reads the channel planes in v20-v23 and
// leaves the 8 packed pixels in v0 — confirm against the macro.
// The loop runs at least once and repeats while the remaining count is
// positive, so pix is effectively rounded up to a multiple of 8.
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                            int pix) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGBTOARGB1555
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb1555),  // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
  );
}
1408#endif  // HAS_ARGBTOARGB1555ROW_NEON
1409
1410#ifdef HAS_ARGBTOARGB4444ROW_NEON
// Converts 4-byte ARGB pixels to 2-byte ARGB4444 pixels, 8 pixels per pass.
// The packing itself is done by the ARGBTOARGB4444 macro (defined outside
// this section); from its use here it reads the channel planes in v20-v23,
// presumably uses the 0x0f mask preloaded in v4, and leaves the 8 packed
// pixels in v0 — confirm against the macro.
// The loop runs at least once and repeats while the remaining count is
// positive, so pix is effectively rounded up to a multiple of 8.
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
                            int pix) {
  asm volatile (
    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGBTOARGB4444
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
    "b.gt       1b                             \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb4444),  // %1
    "+r"(pix)            // %2
  :
  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
  );
}
1430#endif  // HAS_ARGBTOARGB4444ROW_NEON
1431
#ifdef HAS_ARGBTOYROW_NEON
// Converts one row of 4-byte pixels (byte lanes B, G, R, A) to 8-bit luma.
// Per pixel: Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, where the final
// add saturates at 255.
//   src_argb: source row; 32 bytes are consumed per loop iteration.
//   dst_y:    destination row; 8 bytes are produced per loop iteration.
//   pix:      pixel count; decremented by 8 per iteration.
// The count is tested after the loop body, so one iteration always runs.
// Callers are presumably expected to pass a positive multiple of 8; that is
// not checked here.
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v3.8h, v0.8b, v4.8b            \n"  // B (alpha in v3 is dropped)
    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBTOYROW_NEON
1459
#ifdef HAS_ARGBTOYJROW_NEON
// Converts one row of 4-byte pixels (byte lanes B, G, R, A) to 8-bit luma
// without the +16 offset used by ARGBToYRow_NEON.
// Per pixel: Y = (15 * B + 75 * G + 38 * R + 64) >> 7.
//   src_argb: source row; 32 bytes are consumed per loop iteration.
//   dst_y:    destination row; 8 bytes are produced per loop iteration.
//   pix:      pixel count; decremented by 8 per iteration.
// The count is tested after the loop body, so one iteration always runs.
// Callers are presumably expected to pass a positive multiple of 8; that is
// not checked here.
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v3.8h, v0.8b, v4.8b            \n"  // B (alpha in v3 is dropped)
    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
#endif  // HAS_ARGBTOYJROW_NEON
1485
// 8x1 pixels -> 8x1.  No subsampling: one U and one V byte per source pixel.
// Per pixel, in wrapping 16-bit arithmetic:
//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
// pix is the number of source pixels; it is decremented by 8 per iteration
// and tested after the loop body, so one iteration always runs.
#ifdef HAS_ARGBTOUV444ROW_NEON
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
    "movi       v29.16b,#0x80                  \n"  // 128.5 (0x8080 per 16-bit lane)
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned

    // v3 held alpha, which is unused; it is reused as the V accumulator.
    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned

    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V

    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
    MEMACCESS(2)
    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
    "v24", "v25", "v26", "v27", "v28", "v29"
  );
}
#endif  // HAS_ARGBTOUV444ROW_NEON
1529
// 16x1 pixels -> 8x1.  pix is the number of source ARGB pixels, e.g. 16.
// Each horizontally adjacent pixel pair is summed (2x its average) and the
// sums are converted to one U and one V byte using the coefficients in
// v20-v24 and the bias in v25.  Those registers are expected to be loaded by
// RGBTOUV_SETUP_REG, which is defined elsewhere in this file.
// pix is decremented by 16 per iteration and tested after the loop body, so
// one iteration always runs.
#ifdef HAS_ARGBTOUV422ROW_NEON
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.

    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.

    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned

    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned

    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V

    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
    MEMACCESS(2)
    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_ARGBTOUV422ROW_NEON
1573
// 32x1 pixels -> 8x1.  pix is the number of source ARGB pixels, e.g. 32.
// Each run of 4 horizontally adjacent pixels is summed and halved with
// rounding (2x its average), then converted to one U and one V byte using the
// coefficients in v20-v24 and the bias in v25.  Those registers are expected
// to be loaded by RGBTOUV_SETUP_REG, which is defined elsewhere in this file.
// pix is decremented by 32 per iteration and tested after the loop body, so
// one iteration always runs.
#ifdef HAS_ARGBTOUV411ROW_NEON
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(0)
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.

    // Pairwise-add the pair sums: each lane becomes the sum of 4 pixels.
    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
    MEMACCESS(2)
    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_ARGBTOUV411ROW_NEON
1626
1627// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
// Emits the asm that turns 16-bit B, G, R lanes into 8 U and 8 V bytes.
//   QB, QG, QR: 8h register operands holding the blue, green and red inputs.
//               They must not be v3 or v4: both are written before all of
//               the inputs have been read.
// Reads the coefficients from v20-v24 and the bias from v25; the enclosing
// asm block must load them before the loop.
//   v3 = QB * v20 - QG * v21 - QR * v22 + v25;  v0.8b = v3 >> 8   (U)
//   v4 = QR * v20 - QG * v24 - QB * v23 + v25;  v1.8b = v4 >> 8   (V)
// Clobbers v3 and v4 and leaves the results in v0.8b (U) and v1.8b (V).
#define RGBTOUV(QB, QG, QR) \
    "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
    "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
    "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
    "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
    "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
    "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
1639
1640// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1641// TODO(fbarchard): consider ptrdiff_t for all strides.
1642
#ifdef HAS_ARGBTOUVROW_NEON
// 16x2 pixels -> 8x1.  Produces one U and one V byte per 2x2 block of source
// pixels (byte lanes B, G, R, A).
//   src_argb:        first source row; the second row is read from
//                    src_argb + src_stride_argb.
//   src_stride_argb: byte offset from the first row to the second.
//   dst_u, dst_v:    destination rows; 8 bytes each per loop iteration.
//   pix:             pixels per source row; decremented by 16 per iteration
//                    and tested after the loop body, so one iteration always
//                    runs.
// Coefficients and bias are expected in v20-v25, loaded by RGBTOUV_SETUP_REG
// (defined elsewhere in this file).
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.

    MEMACCESS(1)
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_ARGBTOUVROW_NEON
1684
// TODO(fbarchard): Subsample match C code.
#ifdef HAS_ARGBTOUVJROW_NEON
// 16x2 pixels -> 8x1.  Same structure as ARGBToUVRow_NEON, but loads its own
// coefficients (63, 42, 21, 10, 53) into v20-v24 instead of using
// RGBTOUV_SETUP_REG.
//   src_argb:        first source row; the second row is read from
//                    src_argb + src_stride_argb.
//   src_stride_argb: byte offset from the first row to the second.
//   dst_u, dst_v:    destination rows; 8 bytes each per loop iteration.
//   pix:             pixels per source row; decremented by 16 per iteration
//                    and tested after the loop body, so one iteration always
//                    runs.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_ARGBTOUVJROW_NEON
1731
#ifdef HAS_BGRATOUVROW_NEON
// 16x2 pixels -> 8x1.  Same as ARGBToUVRow_NEON but for 4-byte pixels whose
// byte 1 is R, byte 2 is G and byte 3 is B; byte 0 is loaded but unused.
//   src_bgra:        first source row; the second row is read from
//                    src_bgra + src_stride_bgra.
//   src_stride_bgra: byte offset from the first row to the second.
//   dst_u, dst_v:    destination rows; 8 bytes each per loop iteration.
//   pix:             pixels per source row; decremented by 16 per iteration
//                    and tested after the loop body, so one iteration always
//                    runs.
// Coefficients and bias are expected in v20-v25, loaded by RGBTOUV_SETUP_REG
// (defined elsewhere in this file).
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    // The sums are moved so that B ends in v0, G in v3 and R in v2.
    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
    "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v3.8h, #1               \n"  // G moves from v3 to v1
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(src_bgra_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_BGRATOUVROW_NEON
1772
#ifdef HAS_ABGRTOUVROW_NEON
// 16x2 pixels -> 8x1.  Same as ARGBToUVRow_NEON but for 4-byte pixels whose
// byte 0 is R, byte 1 is G and byte 2 is B; byte 3 is loaded but unused.
//   src_abgr:        first source row; the second row is read from
//                    src_abgr + src_stride_abgr.
//   src_stride_abgr: byte offset from the first row to the second.
//   dst_u, dst_v:    destination rows; 8 bytes each per loop iteration.
//   pix:             pixels per source row; decremented by 16 per iteration
//                    and tested after the loop body, so one iteration always
//                    runs.
// Coefficients and bias are expected in v20-v25, loaded by RGBTOUV_SETUP_REG
// (defined elsewhere in this file).
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    // Each sum is written one register up, so B lands in v3, G in v2, R in v1.
    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
    "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average; B moves to v0
    "urshr      v2.8h, v2.8h, #1               \n"
    "urshr      v1.8h, v1.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v2.8h, v1.8h)
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(src_abgr_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_ABGRTOUVROW_NEON
1813
#ifdef HAS_RGBATOUVROW_NEON
// 16x2 pixels -> 8x1.  Same as ARGBToUVRow_NEON but for 4-byte pixels whose
// byte 1 is B, byte 2 is G and byte 3 is R; byte 0 is loaded but unused.
//   src_rgba:        first source row; the second row is read from
//                    src_rgba + src_stride_rgba.
//   src_stride_rgba: byte offset from the first row to the second.
//   dst_u, dst_v:    destination rows; 8 bytes each per loop iteration.
//   pix:             pixels per source row; decremented by 16 per iteration
//                    and tested after the loop body, so one iteration always
//                    runs.
// Coefficients and bias are expected in v20-v25, loaded by RGBTOUV_SETUP_REG
// (defined elsewhere in this file).
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    // Each sum is written one register down, so B lands in v0, G in v1, R in v2.
    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
    "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_rgba),  // %0
    "+r"(src_rgba_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_RGBATOUVROW_NEON
1854
#ifdef HAS_RGB24TOUVROW_NEON
// 16x2 pixels -> 8x1.  Same as ARGBToUVRow_NEON but for 3-byte pixels whose
// byte 0 is B, byte 1 is G and byte 2 is R (48 bytes per 16 pixels).
//   src_rgb24:        first source row; the second row is read from
//                     src_rgb24 + src_stride_rgb24.
//   src_stride_rgb24: byte offset from the first row to the second.
//   dst_u, dst_v:     destination rows; 8 bytes each per loop iteration.
//   pix:              pixels per source row; decremented by 16 per iteration
//                     and tested after the loop body, so one iteration always
//                     runs.
// Coefficients and bias are expected in v20-v25, loaded by RGBTOUV_SETUP_REG
// (defined elsewhere in this file).
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                       uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 per row (x2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_rgb24),  // %0
    "+r"(src_rgb24_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_RGB24TOUVROW_NEON
1895
#ifdef HAS_RAWTOUVROW_NEON
// 16x2 pixels -> 8x1.  Same as RGB24ToUVRow_NEON but with the channel order
// reversed: byte 0 is R, byte 1 is G and byte 2 is B (48 bytes per 16 pixels).
//   src_raw:        first source row; the second row is read from
//                   src_raw + src_stride_raw.
//   src_stride_raw: byte offset from the first row to the second.
//   dst_u, dst_v:   destination rows; 8 bytes each per loop iteration.
//   pix:            pixels per source row; decremented by 16 per iteration
//                   and tested after the loop body, so one iteration always
//                   runs.
// Coefficients and bias are expected in v20-v25, loaded by RGBTOUV_SETUP_REG
// (defined elsewhere in this file).
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
                     uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_raw_1 = src_raw + src_stride_raw;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
    "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v0.8h, v0.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 per row (x2 rows) per loop.
    RGBTOUV(v2.8h, v1.8h, v0.8h)
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_raw),  // %0
    "+r"(src_raw_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_RAWTOUVROW_NEON
1936
// 16x2 pixels -> 8x1.  pix is the number of RGB565 pixels per row, e.g. 16.
// Each 16-byte load holds 8 two-byte pixels.  RGB565TOARGB (defined elsewhere
// in this file) unpacks them; the code below reads B, G and R from v0.8b,
// v1.8b and v2.8b after each expansion.
//   src_rgb565:        first source row; the second row is read from
//                      src_rgb565 + src_stride_rgb565.
//   src_stride_rgb565: byte offset from the first row to the second.
//   dst_u, dst_v:      destination rows; 8 bytes each per loop iteration.
// The coefficients are loaded into v22-v27 here, not through
// RGBTOUV_SETUP_REG, because v16-v21 serve as accumulators.
// pix is decremented by 16 per iteration and tested after the loop body, so
// one iteration always runs.
#ifdef HAS_RGB565TOUVROW_NEON
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                        uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
  asm volatile (
    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

    // Join the two 4-lane halves into 8-lane B (v16), G (v18), R (v20).
    "ins        v16.D[1], v17.D[0]             \n"
    "ins        v18.D[1], v19.D[0]             \n"
    "ins        v20.D[1], v21.D[0]             \n"

    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
    "urshr      v5.8h, v18.8h, #1              \n"
    "urshr      v6.8h, v20.8h, #1              \n"

    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
    "mul        v16.8h, v4.8h, v22.8h          \n"  // B
    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
    "mul        v17.8h, v6.8h, v22.8h          \n"  // R
    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(src_rgb565_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
    "v25", "v26", "v27"
  );
}
#endif  // HAS_RGB565TOUVROW_NEON
2012
// 16x2 pixels -> 8x1.  pix is the number of ARGB1555 pixels per row, e.g. 16.
// Each 16-byte load holds 8 two-byte pixels.  RGB555TOARGB (defined elsewhere
// in this file) unpacks them; the code below reads B, G and R from v0.8b,
// v1.8b and v2.8b after each expansion.
//   src_argb1555:        first source row; the second row is read from
//                        src_argb1555 + src_stride_argb1555.
//   src_stride_argb1555: byte offset from the first row to the second.
//   dst_u, dst_v:        destination rows; 8 bytes each per loop iteration.
// Coefficients and bias are expected in v20-v25, loaded by RGBTOUV_SETUP_REG
// (defined elsewhere in this file), so the accumulators are v16-v18 and
// v26-v28.
// pix is decremented by 16 per iteration and tested after the loop body, so
// one iteration always runs.
#ifdef HAS_ARGB1555TOUVROW_NEON
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
                          uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

    // Join the two 4-lane halves into 8-lane B (v16), G (v17), R (v18).
    "ins        v16.D[1], v26.D[0]             \n"
    "ins        v17.D[1], v27.D[0]             \n"
    "ins        v18.D[1], v28.D[0]             \n"

    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
    "urshr      v5.8h, v17.8h, #1              \n"
    "urshr      v6.8h, v18.8h, #1              \n"

    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(src_argb1555_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"
  );
}
#endif  // HAS_ARGB1555TOUVROW_NEON
2083
2084// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
2085#ifdef HAS_ARGB4444TOUVROW_NEON
// Produces one row of U and one row of V from two adjacent rows of ARGB4444.
//   src_argb4444        - first source row (2 bytes per pixel).
//   src_stride_argb4444 - byte offset from the first row to the second row.
//   dst_u, dst_v        - output rows; 8 bytes are written to each per pass.
//   pix                 - number of source pixels per row.
// Each pass reads 32 bytes (16 pixels) from each row, sums every 2x2 block of
// B, G and R, and converts the halved sums to U and V with the coefficients
// that RGBTOUV_SETUP_REG (defined elsewhere) loads into v20-v25.
// The loop body runs before pix is tested and steps by 16, so callers
// presumably pass pix as a positive multiple of 16.
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
                          uint8* dst_u, uint8* dst_v, int pix) {
  // Pointer to the second source row.
  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    // Row 0, first 8 pixels: start the horizontal pair sums.
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
    // Row 0, next 8 pixels.
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

    // Row 1, first 8 pixels: accumulate pair sums onto the row 0 sums.
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
    // Row 1, next 8 pixels.
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

    // Merge the two 4-lane halves into full 8-lane B, G and R sums.
    "ins        v16.D[1], v26.D[0]             \n"
    "ins        v17.D[1], v27.D[0]             \n"
    "ins        v18.D[1], v28.D[0]             \n"

    // Each lane holds the sum of a 2x2 block; a rounded shift by 1 leaves
    // twice the block average.
    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
    "urshr      v5.8h, v17.8h, #1              \n"
    "urshr      v6.8h, v18.8h, #1              \n"

    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_argb4444),  // %0
    "+r"(src_argb4444_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"

  );
}
2154#endif  // HAS_ARGB4444TOUVROW_NEON
2155
2156#ifdef HAS_RGB565TOYROW_NEON
// Converts a row of RGB565 pixels to 8-bit luma.
//   src_rgb565 - source row, 2 bytes per pixel; 16 bytes are read per pass.
//   dst_y      - output row; 8 bytes are written per pass.
//   pix        - number of pixels to convert.
// Per pixel: Y = sat8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// RGB565TOARGB (defined elsewhere) is expected to leave the expanded B, G and
// R bytes in v0, v1 and v2.
// The loop body runs before pix is tested and steps by 8, so callers
// presumably pass pix as a positive multiple of 8.
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
    "movi       v27.8b, #16                    \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    RGB565TOARGB
    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
    // Rounding narrow: (sum + 64) >> 7, saturated to 8 bits.
    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // saturating +16 offset.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_rgb565),  // %0
    "+r"(dst_y),       // %1
    "+r"(pix)          // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
    "v24", "v25", "v26", "v27"
  );
}
2184#endif  // HAS_RGB565TOYROW_NEON
2185
2186#ifdef HAS_ARGB1555TOYROW_NEON
// Converts a row of ARGB1555 pixels to 8-bit luma.
//   src_argb1555 - source row, 2 bytes per pixel; 16 bytes are read per pass.
//   dst_y        - output row; 8 bytes are written per pass.
//   pix          - number of pixels to convert.
// Per pixel: Y = sat8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// ARGB1555TOARGB (defined elsewhere) is expected to leave the expanded B, G
// and R bytes in v0, v1 and v2.
// The loop body runs before pix is tested and steps by 8, so callers
// presumably pass pix as a positive multiple of 8.
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGB1555TOARGB
    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
    // Rounding narrow: (sum + 64) >> 7, saturated to 8 bits.
    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // saturating +16 offset.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_argb1555),  // %0
    "+r"(dst_y),         // %1
    "+r"(pix)            // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
2213#endif  // HAS_ARGB1555TOYROW_NEON
2214
2215#ifdef HAS_ARGB4444TOYROW_NEON
// Converts a row of ARGB4444 pixels to 8-bit luma.
//   src_argb4444 - source row, 2 bytes per pixel; 16 bytes are read per pass.
//   dst_y        - output row; 8 bytes are written per pass.
//   pix          - number of pixels to convert.
// Per pixel: Y = sat8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// ARGB4444TOARGB (defined elsewhere) is expected to leave the expanded B, G
// and R bytes in v0, v1 and v2.
// The loop body runs before pix is tested and steps by 8, so callers
// presumably pass pix as a positive multiple of 8.
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
    "movi       v27.8b, #16                    \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGB4444TOARGB
    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
    // Rounding narrow: (sum + 64) >> 7, saturated to 8 bits.
    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // saturating +16 offset.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_argb4444),  // %0
    "+r"(dst_y),         // %1
    "+r"(pix)            // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
  );
}
2242#endif  // HAS_ARGB4444TOYROW_NEON
2243
2244#ifdef HAS_BGRATOYROW_NEON
// Converts a row of 4-byte BGRA pixels to 8-bit luma.
//   src_bgra - source row; 32 bytes (8 pixels) are read per pass.
//   dst_y    - output row; 8 bytes are written per pass.
//   pix      - number of pixels to convert.
// The ld4 de-interleaves the four bytes of each pixel into v0..v3; bytes 1, 2
// and 3 are weighted as R, G and B, and byte 0 (v0) is not used:
//   Y = sat8(((33 * R + 65 * G + 13 * B + 64) >> 7) + 16).
// The loop body runs before pix is tested and steps by 8, so callers
// presumably pass pix as a positive multiple of 8.
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
    // Rounding narrow: (sum + 64) >> 7, saturated to 8 bits.
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // saturating +16 offset.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2270#endif  // HAS_BGRATOYROW_NEON
2271
2272#ifdef HAS_ABGRTOYROW_NEON
// Converts a row of 4-byte ABGR pixels to 8-bit luma.
//   src_abgr - source row; 32 bytes (8 pixels) are read per pass.
//   dst_y    - output row; 8 bytes are written per pass.
//   pix      - number of pixels to convert.
// The ld4 de-interleaves the four bytes of each pixel into v0..v3; bytes 0, 1
// and 2 are weighted as R, G and B, and byte 3 (v3) is not used:
//   Y = sat8(((33 * R + 65 * G + 13 * B + 64) >> 7) + 16).
// The loop body runs before pix is tested and steps by 8, so callers
// presumably pass pix as a positive multiple of 8.
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
    // Rounding narrow: (sum + 64) >> 7, saturated to 8 bits.
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // saturating +16 offset.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2298#endif  // HAS_ABGRTOYROW_NEON
2299
2300#ifdef HAS_RGBATOYROW_NEON
// Converts a row of 4-byte RGBA pixels to 8-bit luma.
//   src_rgba - source row; 32 bytes (8 pixels) are read per pass.
//   dst_y    - output row; 8 bytes are written per pass.
//   pix      - number of pixels to convert.
// The ld4 de-interleaves the four bytes of each pixel into v0..v3; bytes 1, 2
// and 3 are weighted as B, G and R, and byte 0 (v0) is not used:
//   Y = sat8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// The loop body runs before pix is tested and steps by 8, so callers
// presumably pass pix as a positive multiple of 8.
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
    // Rounding narrow: (sum + 64) >> 7, saturated to 8 bits.
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // saturating +16 offset.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2326#endif  // HAS_RGBATOYROW_NEON
2327
2328#ifdef HAS_RGB24TOYROW_NEON
// Converts a row of 3-byte RGB24 pixels to 8-bit luma.
//   src_rgb24 - source row; 24 bytes (8 pixels) are read per pass.
//   dst_y     - output row; 8 bytes are written per pass.
//   pix       - number of pixels to convert.
// The ld3 de-interleaves the three bytes of each pixel into v0..v2, which are
// weighted as B, G and R respectively:
//   Y = sat8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// The loop body runs before pix is tested and steps by 8, so callers
// presumably pass pix as a positive multiple of 8.
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
    // Rounding narrow: (sum + 64) >> 7, saturated to 8 bits.
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // saturating +16 offset.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_rgb24),  // %0
    "+r"(dst_y),      // %1
    "+r"(pix)         // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2354#endif  // HAS_RGB24TOYROW_NEON
2355
2356#ifdef HAS_RAWTOYROW_NEON
// Converts a row of 3-byte RAW pixels to 8-bit luma.
//   src_raw - source row; 24 bytes (8 pixels) are read per pass.
//   dst_y   - output row; 8 bytes are written per pass.
//   pix     - number of pixels to convert.
// The ld3 de-interleaves the three bytes of each pixel into v0..v2, which are
// weighted with the R, G and B coefficients respectively:
//   Y = sat8(((33 * R + 65 * G + 13 * B + 64) >> 7) + 16).
// The loop body runs before pix is tested and steps by 8, so callers
// presumably pass pix as a positive multiple of 8.
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
  asm volatile (
    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v0.8b, v4.8b           \n"  // R (v4 = 33)
    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B (v6 = 13)
    // Rounding narrow: (sum + 64) >> 7, saturated to 8 bits.
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // saturating +16 offset.
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_raw),  // %0
    "+r"(dst_y),    // %1
    "+r"(pix)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2382#endif  // HAS_RAWTOYROW_NEON
2383
2384// Bilinear filter 16x2 -> 16x1
2385#ifdef HAS_INTERPOLATEROW_NEON
2386void InterpolateRow_NEON(uint8* dst_ptr,
2387                         const uint8* src_ptr, ptrdiff_t src_stride,
2388                         int dst_width, int source_y_fraction) {
2389  int y1_fraction = source_y_fraction;
2390  int y0_fraction = 256 - y1_fraction;
2391  const uint8* src_ptr1 = src_ptr + src_stride;
2392  asm volatile (
2393    "cmp        %w4, #0                        \n"
2394    "b.eq       100f                           \n"
2395    "cmp        %w4, #64                       \n"
2396    "b.eq       75f                            \n"
2397    "cmp        %w4, #128                      \n"
2398    "b.eq       50f                            \n"
2399    "cmp        %w4, #192                      \n"
2400    "b.eq       25f                            \n"
2401
2402    "dup        v5.16b, %w4                    \n"
2403    "dup        v4.16b, %w5                    \n"
2404    // General purpose row blend.
2405  "1:                                          \n"
2406    MEMACCESS(1)
2407    "ld1        {v0.16b}, [%1], #16            \n"
2408    MEMACCESS(2)
2409    "ld1        {v1.16b}, [%2], #16            \n"
2410    "subs       %w3, %w3, #16                  \n"
2411    "umull      v2.8h, v0.8b,  v4.8b           \n"
2412    "umull2     v3.8h, v0.16b, v4.16b          \n"
2413    "umlal      v2.8h, v1.8b,  v5.8b           \n"
2414    "umlal2     v3.8h, v1.16b, v5.16b          \n"
2415    "rshrn      v0.8b,  v2.8h, #8              \n"
2416    "rshrn2     v0.16b, v3.8h, #8              \n"
2417    MEMACCESS(0)
2418    "st1        {v0.16b}, [%0], #16            \n"
2419    "b.gt       1b                             \n"
2420    "b          99f                            \n"
2421
2422    // Blend 25 / 75.
2423  "25:                                         \n"
2424    MEMACCESS(1)
2425    "ld1        {v0.16b}, [%1], #16            \n"
2426    MEMACCESS(2)
2427    "ld1        {v1.16b}, [%2], #16            \n"
2428    "subs       %w3, %w3, #16                  \n"
2429    "urhadd     v0.16b, v0.16b, v1.16b         \n"
2430    "urhadd     v0.16b, v0.16b, v1.16b         \n"
2431    MEMACCESS(0)
2432    "st1        {v0.16b}, [%0], #16            \n"
2433    "b.gt       25b                            \n"
2434    "b          99f                            \n"
2435
2436    // Blend 50 / 50.
2437  "50:                                         \n"
2438    MEMACCESS(1)
2439    "ld1        {v0.16b}, [%1], #16            \n"
2440    MEMACCESS(2)
2441    "ld1        {v1.16b}, [%2], #16            \n"
2442    "subs       %w3, %w3, #16                  \n"
2443    "urhadd     v0.16b, v0.16b, v1.16b         \n"
2444    MEMACCESS(0)
2445    "st1        {v0.16b}, [%0], #16            \n"
2446    "b.gt       50b                            \n"
2447    "b          99f                            \n"
2448
2449    // Blend 75 / 25.
2450  "75:                                         \n"
2451    MEMACCESS(1)
2452    "ld1        {v1.16b}, [%1], #16            \n"
2453    MEMACCESS(2)
2454    "ld1        {v0.16b}, [%2], #16            \n"
2455    "subs       %w3, %w3, #16                  \n"
2456    "urhadd     v0.16b, v0.16b, v1.16b         \n"
2457    "urhadd     v0.16b, v0.16b, v1.16b         \n"
2458    MEMACCESS(0)
2459    "st1        {v0.16b}, [%0], #16            \n"
2460    "b.gt       75b                            \n"
2461    "b          99f                            \n"
2462
2463    // Blend 100 / 0 - Copy row unchanged.
2464  "100:                                        \n"
2465    MEMACCESS(1)
2466    "ld1        {v0.16b}, [%1], #16            \n"
2467    "subs       %w3, %w3, #16                  \n"
2468    MEMACCESS(0)
2469    "st1        {v0.16b}, [%0], #16            \n"
2470    "b.gt       100b                           \n"
2471
2472  "99:                                         \n"
2473  : "+r"(dst_ptr),          // %0
2474    "+r"(src_ptr),          // %1
2475    "+r"(src_ptr1),         // %2
2476    "+r"(dst_width),        // %3
2477    "+r"(y1_fraction),      // %4
2478    "+r"(y0_fraction)       // %5
2479  :
2480  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2481  );
2482}
2483#endif  // HAS_INTERPOLATEROW_NEON
2484
2485// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2486#ifdef HAS_ARGBBLENDROW_NEON
// Blends a row of src_argb0 over src_argb1 using src_argb0's alpha channel.
//   src_argb0 - first source row, 4 bytes per pixel; supplies the alpha.
//   src_argb1 - second source row, 4 bytes per pixel.
//   dst_argb  - output row, 4 bytes per pixel.
//   width     - number of pixels; any value is handled, with groups of 8
//               first and the remainder one pixel at a time.
// Per colour channel (all steps saturating):
//   dst = s0 + (s1 - ((s1 * a0 + 128) >> 8))
// The destination alpha byte is always written as 255.
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    // Pre-decrement so the sign of width says whether 8 pixels remain.
    "subs       %w3, %w3, #8                   \n"
    "b.lt       89f                            \n"
    // Blend 8 pixels.
  "8:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
    MEMACCESS(1)
    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
    "movi       v3.8b, #255                    \n"  // a = 255
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.ge       8b                             \n"

    // width is now (remaining - 8); adding 7 leaves (remaining - 1), which is
    // negative when no pixels are left.
  "89:                                         \n"
    "adds       %w3, %w3, #8-1                 \n"
    "b.lt       99f                            \n"

    // Blend 1 pixels.
    // Only lane 0 is loaded and stored; the arithmetic still runs on all 8
    // lanes but the other lanes are discarded.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
    MEMACCESS(1)
    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
    "movi       v3.8b, #255                    \n"  // a = 255
    MEMACCESS(2)
    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
    "b.ge       1b                             \n"

  "99:                                         \n"

  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18"
  );
}
2555#endif  // HAS_ARGBBLENDROW_NEON
2556
2557// Attenuate 8 pixels at a time.
2558#ifdef HAS_ARGBATTENUATEROW_NEON
// Multiplies the B, G and R channels of each ARGB pixel by that pixel's alpha.
//   src_argb - source row; 32 bytes (8 pixels) are read per pass.
//   dst_argb - output row; 32 bytes are written per pass.
//   width    - number of pixels to process.
// Per colour channel: dst = (c * a + 128) >> 8, saturated to 8 bits.
// The alpha byte is stored unchanged.
// The loop body runs before width is tested and steps by 8, so callers
// presumably pass width as a positive multiple of 8.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
    MEMACCESS(1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
2582#endif  // HAS_ARGBATTENUATEROW_NEON
2583
2584// Quantize 8 ARGB pixels (32 bytes).
2585// dst = (dst * scale >> 16) * interval_size + interval_offset;
2586#ifdef HAS_ARGBQUANTIZEROW_NEON
// Quantizes the B, G and R channels of a row of ARGB pixels in place.
//   dst_argb        - row to modify; 32 bytes (8 pixels) per pass.
//   scale           - multiplier; only its low 16 bits are broadcast.
//   interval_size   - multiplier applied after scaling (low 16 bits).
//   interval_offset - value added last (low 16 bits).
//   width           - number of pixels to process.
// scale is halved up front because sqdmulh doubles the product before taking
// its high half, giving (c * scale) >> 16 per channel.
// The alpha byte is loaded and stored unchanged.
// The loop body runs before width is tested and steps by 8, so callers
// presumably pass width as a positive multiple of 8.
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "dup        v4.8h, %w2                     \n"
    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
    "dup        v5.8h, %w3                     \n"  // interval multiply.
    "dup        v6.8h, %w4                     \n"  // interval add

    // 8 pixel loop.
  "1:                                          \n"
    // No post-increment here: the store below advances the shared pointer.
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
    "uxtl       v1.8h, v1.8b                   \n"
    "uxtl       v2.8h, v2.8b                   \n"
    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
    "add        v1.8h, v1.8h, v6.8h            \n"  // g
    "add        v2.8h, v2.8h, v6.8h            \n"  // r
    "uqxtn      v0.8b, v0.8h                   \n"  // saturate to 8 bits.
    "uqxtn      v1.8b, v1.8h                   \n"
    "uqxtn      v2.8b, v2.8h                   \n"
    MEMACCESS(0)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
2626#endif  // HAS_ARGBQUANTIZEROW_NEON
2627
2628// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
2630// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2631#ifdef HAS_ARGBSHADEROW_NEON
// Scales each channel of a row of ARGB pixels by the matching byte of value.
//   src_argb - source row; 32 bytes (8 pixels) are read per pass.
//   dst_argb - output row; 32 bytes are written per pass.
//   width    - number of pixels to process.
//   value    - four 8-bit scale factors; byte 0 scales the first byte of each
//              pixel (labelled b below), byte 3 the last (labelled a).
// Setup duplicates each byte of value into both halves of a 16-bit lane and
// halves it; sqrdmulh then doubles the product, rounds and keeps the high
// half, and the result is narrowed with unsigned saturation.
// The loop body runs before width is tested and steps by 8, so callers
// presumably pass width as a positive multiple of 8.
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.

    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
    "uxtl       v5.8h, v5.8b                   \n"
    "uxtl       v6.8h, v6.8b                   \n"
    "uxtl       v7.8h, v7.8b                   \n"
    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
    "uqxtn      v4.8b, v4.8h                   \n"  // saturate to 8 bits.
    "uqxtn      v5.8b, v5.8h                   \n"
    "uqxtn      v6.8b, v6.8h                   \n"
    "uqxtn      v7.8b, v7.8h                   \n"
    MEMACCESS(1)
    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "r"(value)            // %3
  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
  );
}
2666#endif  // HAS_ARGBSHADEROW_NEON
2667
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2669// Similar to ARGBToYJ but stores ARGB.
2670// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2671#ifdef HAS_ARGBGRAYROW_NEON
// Converts a row of ARGB pixels to gray ARGB pixels.
//   src_argb - source row; 32 bytes (8 pixels) are read per pass.
//   dst_argb - output row; 32 bytes are written per pass.
//   width    - number of pixels to process.
// gray = (15 * B + 75 * G + 38 * R + 64) >> 7, saturated to 8 bits, is
// written to the B, G and R bytes; the alpha byte is stored unchanged.
// The loop body runs before width is tested and steps by 8, so callers
// presumably pass width as a positive multiple of 8.
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
    // orr of a register with itself is a plain copy of the gray value.
    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
    MEMACCESS(1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
  );
}
2697#endif  // HAS_ARGBGRAYROW_NEON
2698
2699// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2700//    b = (r * 35 + g * 68 + b * 17) >> 7
2701//    g = (r * 45 + g * 88 + b * 22) >> 7
2702//    r = (r * 50 + g * 98 + b * 24) >> 7
2703
2704#ifdef HAS_ARGBSEPIAROW_NEON
// Applies a sepia tone to a row of ARGB pixels in place.
//   dst_argb - row to modify; 32 bytes (8 pixels) per pass.
//   width    - number of pixels to process.
// Each output colour channel is a weighted sum of the input B, G and R,
// shifted right by 7 without rounding and saturated to 8 bits:
//   b = (17 * B + 68 * G + 35 * R) >> 7
//   g = (22 * B + 88 * G + 45 * R) >> 7
//   r = (24 * B + 98 * G + 50 * R) >> 7
// The alpha byte is loaded and stored unchanged.
// The loop body runs before width is tested and steps by 8, so callers
// presumably pass width as a positive multiple of 8.
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "movi       v20.8b, #17                    \n"  // BB coefficient
    "movi       v21.8b, #68                    \n"  // BG coefficient
    "movi       v22.8b, #35                    \n"  // BR coefficient
    "movi       v24.8b, #22                    \n"  // GB coefficient
    "movi       v25.8b, #88                    \n"  // GG coefficient
    "movi       v26.8b, #45                    \n"  // GR coefficient
    "movi       v28.8b, #24                    \n"  // RB coefficient
    "movi       v29.8b, #98                    \n"  // RG coefficient
    "movi       v30.8b, #50                    \n"  // RR coefficient
  "1:                                          \n"
    // No post-increment here: the store below advances the shared pointer.
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
    MEMACCESS(0)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
    "b.gt       1b                             \n"  // flags come from subs.
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
  );
}
2742#endif  // HAS_ARGBSEPIAROW_NEON
2743
// Transform 8 ARGB pixels (32 bytes) with color matrix.
2745// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
2746// needs to saturate.  Consider doing a non-saturating version.
2747#ifdef HAS_ARGBCOLORMATRIXROW_NEON
2748void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2749                             const int8* matrix_argb, int width) {
2750  asm volatile (
2751    MEMACCESS(3)
2752    "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
2753    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
2754    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
2755
2756  "1:                                          \n"
2757    MEMACCESS(0)
2758    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
2759    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2760    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
2761    "uxtl       v17.8h, v17.8b                 \n"  // g
2762    "uxtl       v18.8h, v18.8b                 \n"  // r
2763    "uxtl       v19.8h, v19.8b                 \n"  // a
2764    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
2765    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
2766    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
2767    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
2768    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
2769    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
2770    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
2771    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
2772    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2773    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2774    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2775    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2776    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
2777    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
2778    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
2779    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
2780    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2781    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2782    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2783    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2784    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
2785    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
2786    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
2787    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
2788    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2789    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2790    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2791    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2792    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
2793    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
2794    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
2795    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
2796    MEMACCESS(1)
2797    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
2798    "b.gt       1b                             \n"
2799  : "+r"(src_argb),   // %0
2800    "+r"(dst_argb),   // %1
2801    "+r"(width)       // %2
2802  : "r"(matrix_argb)  // %3
2803  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
2804    "v18", "v19", "v22", "v23", "v24", "v25"
2805  );
2806}
2807#endif  // HAS_ARGBCOLORMATRIXROW_NEON
2808
2809// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// NOTE(review): the body below narrows with rshrn and contains no vqshrun or
// sqshrun, so the TODO above may be stale - confirm before acting on it.
2810// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
//
// Every channel byte of the result is (a * b + 128) >> 8, where a and b are
// the matching bytes of src_argb0 and src_argb1: umull forms the 16-bit
// product and rshrn #8 rounds and narrows it.  With this scaling 255 * 255
// yields 254 rather than 255.
//
// Each pass reads 32 bytes from both sources and writes 32 bytes to dst_argb.
// The counter is only tested at the end of a pass, so at least one pass always
// runs and a width that is not a multiple of 8 is processed as if rounded up.
2811#ifdef HAS_ARGBMULTIPLYROW_NEON
2812void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2813                          uint8* dst_argb, int width) {
2814  asm volatile (
2815    // 8 pixel loop.
2816  "1:                                          \n"
2817    MEMACCESS(0)
2818    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2819    MEMACCESS(1)
2820    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2821    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2822    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
2823    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
2824    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
2825    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
2826    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
2827    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
2828    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
2829    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
2830    MEMACCESS(2)
2831    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2832    "b.gt       1b                             \n"
2833
2834  : "+r"(src_argb0),  // %0
2835    "+r"(src_argb1),  // %1
2836    "+r"(dst_argb),   // %2
2837    "+r"(width)       // %3
2838  :
2839  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2840  );
2841}
2842#endif  // HAS_ARGBMULTIPLYROW_NEON
2843
2844// Add 2 rows of ARGB pixels together, 8 pixels at a time.
//
// Every channel byte of dst_argb is the sum of the matching bytes of src_argb0
// and src_argb1, saturated at 255 (uqadd).
//
// Each pass reads 32 bytes from both sources and writes 32 bytes to dst_argb.
// The counter is only tested at the end of a pass, so at least one pass always
// runs and a width that is not a multiple of 8 is processed as if rounded up.
2845#ifdef HAS_ARGBADDROW_NEON
2846void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2847                     uint8* dst_argb, int width) {
2848  asm volatile (
2849    // 8 pixel loop.
2850  "1:                                          \n"
2851    MEMACCESS(0)
2852    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2853    MEMACCESS(1)
2854    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2855    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2856    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // saturating add B
2857    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // saturating add G
2858    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // saturating add R
2859    "uqadd      v3.8b, v3.8b, v7.8b            \n"  // saturating add A
2860    MEMACCESS(2)
2861    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2862    "b.gt       1b                             \n"
2863
2864  : "+r"(src_argb0),  // %0
2865    "+r"(src_argb1),  // %1
2866    "+r"(dst_argb),   // %2
2867    "+r"(width)       // %3
2868  :
2869  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2870  );
2871}
2872#endif  // HAS_ARGBADDROW_NEON
2873
2874// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
//
// Every channel byte of dst_argb is the byte from src_argb0 minus the matching
// byte from src_argb1, saturated at 0 (uqsub).
//
// Each pass reads 32 bytes from both sources and writes 32 bytes to dst_argb.
// The counter is only tested at the end of a pass, so at least one pass always
// runs and a width that is not a multiple of 8 is processed as if rounded up.
2875#ifdef HAS_ARGBSUBTRACTROW_NEON
2876void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2877                          uint8* dst_argb, int width) {
2878  asm volatile (
2879    // 8 pixel loop.
2880  "1:                                          \n"
2881    MEMACCESS(0)
2882    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2883    MEMACCESS(1)
2884    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2885    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2886    "uqsub      v0.8b, v0.8b, v4.8b            \n"  // saturating subtract B
2887    "uqsub      v1.8b, v1.8b, v5.8b            \n"  // saturating subtract G
2888    "uqsub      v2.8b, v2.8b, v6.8b            \n"  // saturating subtract R
2889    "uqsub      v3.8b, v3.8b, v7.8b            \n"  // saturating subtract A
2890    MEMACCESS(2)
2891    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2892    "b.gt       1b                             \n"
2893
2894  : "+r"(src_argb0),  // %0
2895    "+r"(src_argb1),  // %1
2896    "+r"(dst_argb),   // %2
2897    "+r"(width)       // %3
2898  :
2899  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2900  );
2901}
2902#endif  // HAS_ARGBSUBTRACTROW_NEON
2903
2904// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2905// A = 255
2906// R = Sobel
2907// G = Sobel
2908// B = Sobel
//
// Sobel is src_sobelx[i] + src_sobely[i] saturated at 255 (uqadd).
//
// Each pass reads 8 bytes from both inputs and writes 8 pixels (32 bytes) to
// dst_argb.  The counter is only tested at the end of a pass, so at least one
// pass always runs and a width that is not a multiple of 8 is processed as if
// rounded up.
2909#ifdef HAS_SOBELROW_NEON
2910void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2911                     uint8* dst_argb, int width) {
2912  asm volatile (
2913    "movi       v3.8b, #255                    \n"  // alpha
2914    // 8 pixel loop.
2915  "1:                                          \n"
2916    MEMACCESS(0)
2917    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
2918    MEMACCESS(1)
2919    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
2920    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2921    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
2922    "orr        v1.8b, v0.8b, v0.8b            \n"  // copy sum to G
2923    "orr        v2.8b, v0.8b, v0.8b            \n"  // copy sum to R
2924    MEMACCESS(2)
2925    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2926    "b.gt       1b                             \n"
2927  : "+r"(src_sobelx),  // %0
2928    "+r"(src_sobely),  // %1
2929    "+r"(dst_argb),    // %2
2930    "+r"(width)        // %3
2931  :
2932  : "cc", "memory", "v0", "v1", "v2", "v3"
2933  );
2934}
2935#endif  // HAS_SOBELROW_NEON
2936
2937// Adds Sobel X and Sobel Y and stores Sobel into plane.
//
// dst_y[i] is src_sobelx[i] + src_sobely[i] saturated at 255 (uqadd).
//
// Each pass reads 16 bytes from both inputs and writes 16 bytes to dst_y.  The
// counter is only tested at the end of a pass, so at least one pass always
// runs and a width that is not a multiple of 16 is processed as if rounded up.
2938#ifdef HAS_SOBELTOPLANEROW_NEON
2939void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2940                          uint8* dst_y, int width) {
2941  asm volatile (
2942    // 16 pixel loop.
2943  "1:                                          \n"
2944    MEMACCESS(0)
2945    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
2946    MEMACCESS(1)
2947    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
2948    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
2949    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
2950    MEMACCESS(2)
2951    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
2952    "b.gt       1b                             \n"
2953  : "+r"(src_sobelx),  // %0
2954    "+r"(src_sobely),  // %1
2955    "+r"(dst_y),       // %2
2956    "+r"(width)        // %3
2957  :
2958  : "cc", "memory", "v0", "v1"
2959  );
2960}
2961#endif  // HAS_SOBELTOPLANEROW_NEON
2962
2963// Mixes Sobel X, Sobel Y and Sobel into ARGB.
2964// A = 255
2965// R = Sobel X
2966// G = Sobel
2967// B = Sobel Y
//
// Sobel is src_sobelx[i] + src_sobely[i] saturated at 255 (uqadd).  The inputs
// are loaded straight into the registers that st4 stores as R (v2) and B (v0),
// so no extra copies are needed.
//
// Each pass reads 8 bytes from both inputs and writes 8 pixels (32 bytes) to
// dst_argb.  The counter is only tested at the end of a pass, so at least one
// pass always runs and a width that is not a multiple of 8 is processed as if
// rounded up.
2968#ifdef HAS_SOBELXYROW_NEON
2969void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2970                     uint8* dst_argb, int width) {
2971  asm volatile (
2972    "movi       v3.8b, #255                    \n"  // alpha
2973    // 8 pixel loop.
2974  "1:                                          \n"
2975    MEMACCESS(0)
2976    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
2977    MEMACCESS(1)
2978    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
2979    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2980    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
2981    MEMACCESS(2)
2982    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2983    "b.gt       1b                             \n"
2984  : "+r"(src_sobelx),  // %0
2985    "+r"(src_sobely),  // %1
2986    "+r"(dst_argb),    // %2
2987    "+r"(width)        // %3
2988  :
2989  : "cc", "memory", "v0", "v1", "v2", "v3"
2990  );
2991}
2992#endif  // HAS_SOBELXYROW_NEON
2993
2994// SobelX as a matrix is
2995// -1  0  1
2996// -2  0  2
2997// -1  0  1
//
// For each output i this computes
//   |(y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|
// clamped to 255, with src_y0, src_y1 and src_y2 as the three input rows.  The
// differences are taken left minus right, the opposite sign to the matrix
// above; the absolute value makes the result the same.
//
// Each pass produces 8 outputs and loads 8 bytes at offset 0 and 8 bytes at
// offset 2 from every row, so each row is read 2 bytes beyond the 8 pixels
// being produced.  The counter is only tested at the end of a pass, so at
// least one pass always runs and a width that is not a multiple of 8 is
// processed as if rounded up.
2998#ifdef HAS_SOBELXROW_NEON
2999void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
3000                    const uint8* src_y2, uint8* dst_sobelx, int width) {
3001  asm volatile (
3002  "1:                                          \n"
3003    MEMACCESS(0)
3004    "ld1        {v0.8b}, [%0],%5               \n"  // top
3005    MEMACCESS(0)
3006    "ld1        {v1.8b}, [%0],%6               \n"  // top, 2 columns right
3007    "usubl      v0.8h, v0.8b, v1.8b            \n"  // widen to 16 bit
3008    MEMACCESS(1)
3009    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
3010    MEMACCESS(1)
3011    "ld1        {v3.8b}, [%1],%6               \n"  // center, 2 columns right
3012    "usubl      v1.8h, v2.8b, v3.8b            \n"
3013    "add        v0.8h, v0.8h, v1.8h            \n"  // added twice for the
3014    "add        v0.8h, v0.8h, v1.8h            \n"  // center weight of 2
3015    MEMACCESS(2)
3016    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
3017    MEMACCESS(2)
3018    "ld1        {v3.8b}, [%2],%6               \n"  // bottom, 2 columns right
3019    "subs       %w4, %w4, #8                   \n"  // 8 pixels
3020    "usubl      v1.8h, v2.8b, v3.8b            \n"
3021    "add        v0.8h, v0.8h, v1.8h            \n"
3022    "abs        v0.8h, v0.8h                   \n"  // magnitude
3023    "uqxtn      v0.8b, v0.8h                   \n"  // clamp to 255
3024    MEMACCESS(3)
3025    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
3026    "b.gt       1b                             \n"
3027  : "+r"(src_y0),      // %0
3028    "+r"(src_y1),      // %1
3029    "+r"(src_y2),      // %2
3030    "+r"(dst_sobelx),  // %3
3031    "+r"(width)        // %4
3032  : "r"(2LL),          // %5  step to the column 2 to the right
3033    "r"(6LL)           // %6  remaining step; 2 + 6 advances 8 per pass
3034  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
3035  );
3036}
3037#endif  // HAS_SOBELXROW_NEON
3038
3039// SobelY as a matrix is
3040// -1 -2 -1
3041//  0  0  0
3042//  1  2  1
//
// For each output i this computes
//   |(y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])|
// clamped to 255.  Only two rows are taken because the middle matrix row is
// all zeros; src_y0 and src_y1 are presumably the rows above and below the
// output row - confirm against the caller.  The differences are taken y0 minus
// y1; the absolute value makes the sign convention irrelevant.
//
// Each pass produces 8 outputs and loads 8 bytes at offsets 0, 1 and 2 from
// both rows, so each row is read 2 bytes beyond the 8 pixels being produced.
// The counter is only tested at the end of a pass, so at least one pass always
// runs and a width that is not a multiple of 8 is processed as if rounded up.
3043#ifdef HAS_SOBELYROW_NEON
3044void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
3045                    uint8* dst_sobely, int width) {
3046  asm volatile (
3047  "1:                                          \n"
3048    MEMACCESS(0)
3049    "ld1        {v0.8b}, [%0],%4               \n"  // left
3050    MEMACCESS(1)
3051    "ld1        {v1.8b}, [%1],%4               \n"
3052    "usubl      v0.8h, v0.8b, v1.8b            \n"  // widen to 16 bit
3053    MEMACCESS(0)
3054    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
3055    MEMACCESS(1)
3056    "ld1        {v3.8b}, [%1],%4               \n"
3057    "usubl      v1.8h, v2.8b, v3.8b            \n"
3058    "add        v0.8h, v0.8h, v1.8h            \n"  // added twice for the
3059    "add        v0.8h, v0.8h, v1.8h            \n"  // center weight of 2
3060    MEMACCESS(0)
3061    "ld1        {v2.8b}, [%0],%5               \n"  // right
3062    MEMACCESS(1)
3063    "ld1        {v3.8b}, [%1],%5               \n"
3064    "subs       %w3, %w3, #8                   \n"  // 8 pixels
3065    "usubl      v1.8h, v2.8b, v3.8b            \n"
3066    "add        v0.8h, v0.8h, v1.8h            \n"
3067    "abs        v0.8h, v0.8h                   \n"  // magnitude
3068    "uqxtn      v0.8b, v0.8h                   \n"  // clamp to 255
3069    MEMACCESS(2)
3070    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
3071    "b.gt       1b                             \n"
3072  : "+r"(src_y0),      // %0
3073    "+r"(src_y1),      // %1
3074    "+r"(dst_sobely),  // %2
3075    "+r"(width)        // %3
3076  : "r"(1LL),          // %4  step to the next column
3077    "r"(6LL)           // %5  remaining step; 1 + 1 + 6 advances 8 per pass
3078  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
3079  );
3080}
3081#endif  // HAS_SOBELYROW_NEON
3082#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3083
3084#ifdef __cplusplus
3085}  // extern "C"
3086}  // namespace libyuv
3087#endif
3088