/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
11ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "libyuv/row.h"
12ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
13ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus
14ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramaniannamespace libyuv {
15ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianextern "C" {
16ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
17ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// This module is for GCC Neon (32-bit ARM only; the guard below excludes AArch64).
197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    !defined(__aarch64__)
21ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y, 4 U and 4 V from 422
//   In:  %0 = Y pointer, %1 = U pointer, %2 = V pointer.
//   Out: d0 = 8 Y bytes; d2 = 4 U bytes (low half) then 4 V bytes (high half).
//   Every pointer is post-incremented past the bytes it supplied.
#define READYUV422                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n" /* 8 Y                    */\
    MEMACCESS(1)                                                               \
    "vld1.32    {d2[0]}, [%1]!                 \n" /* 4 U -> d2 low half     */\
    MEMACCESS(2)                                                               \
    "vld1.32    {d2[1]}, [%2]!                 \n" /* 4 V -> d2 high half */
30ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y, 2 U and 2 V from 411
//   In:  %0 = Y pointer, %1 = U pointer, %2 = V pointer.
//   Out: d0 = 8 Y bytes; d2 = U0 U0 U1 U1 V0 V0 V1 V1, i.e. the same
//        "4 U then 4 V" layout the 422 reader produces, with every chroma
//        sample doubled.  d3 is overwritten as scratch.
//   Y advances 8 bytes; U and V advance 2 bytes each.
#define READYUV411                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n" /* 8 Y                    */\
    MEMACCESS(1)                                                               \
    "vld1.16    {d2[0]}, [%1]!                 \n" /* 2 U -> d2 bytes 0-1    */\
    MEMACCESS(2)                                                               \
    "vld1.16    {d2[1]}, [%2]!                 \n" /* 2 V -> d2 bytes 2-3    */\
    "vmov.u8    d3, d2                         \n" /* copy to zip with self  */\
    "vzip.u8    d2, d3                         \n" /* double every sample */
41ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y, 8 U and 8 V from 444
//   In:  %0 = Y pointer, %1 = U pointer, %2 = V pointer (each advances 8).
//   Out: d0 = 8 Y bytes; d2 = 4 U then 4 V, where each output sample is the
//        rounded average of one horizontal pair of input samples.
//   The full-resolution chroma is therefore reduced to the 422 layout before
//   conversion.  d3 is overwritten as scratch.
#define READYUV444                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n" /* 8 Y                    */\
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n" /* 8 U                    */\
    MEMACCESS(2)                                                               \
    "vld1.8     {d3}, [%2]!                    \n" /* 8 V                    */\
    "vpaddl.u8  q1, q1                         \n" /* sum adjacent pairs     */\
    "vrshrn.u16 d2, q1, #1                     \n" /* rounded /2, narrow */
52ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y, and set 4 U and 4 V to 128
//   In:  %0 = Y pointer (advances 8).
//   Out: d0 = 8 Y bytes; d2 = eight bytes of 128, the chroma bias value that
//        the conversion constants subtract again.
#define READYUV400                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n" /* 8 Y                    */\
    "vmov.u8    d2, #128                       \n" /* U = V = 128 */
58ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y and 4 UV from NV12
//   In:  %0 = Y pointer, %1 = interleaved UV pointer (each advances 8).
//   Out: d0 = 8 Y bytes; d2 = 4 U then 4 V.  d3 is overwritten as scratch.
//   The 8 chroma bytes arrive as U0 V0 U1 V1 ...; vuzp gathers the even
//   bytes (U) into d2 and the odd bytes (V) into d3, and vtrn then moves the
//   V word into the high half of d2.
#define READNV12                                                               \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
68ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y and 4 VU from NV21
//   In:  %0 = Y pointer, %1 = interleaved VU pointer (each advances 8).
//   Out: d0 = 8 Y bytes; d2 = 4 U then 4 V.  d3 is overwritten as scratch.
//   Identical to the NV12 reader except that the vuzp operands are swapped,
//   because here the even bytes are V and the odd bytes are U.
#define READNV21                                                               \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d3, d2                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
78ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 YUY2 pixels (16 bytes)
//   In:  %0 = packed source pointer (advances 16).
//   Out: d0 = 8 Y bytes; d2 = 4 U then 4 V.  d3 is overwritten as scratch.
//   vld2 de-interleaves the stream: even bytes (Y) go to d0 and odd bytes
//   (U0 V0 U1 V1 ...) go to d2, which is then split like the NV12 chroma.
#define READYUY2                                                               \
    MEMACCESS(0)                                                               \
    "vld2.8     {d0, d2}, [%0]!                \n" /* d0 = Y, d2 = UV pairs  */\
    "vmov.u8    d3, d2                         \n"                             \
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
86ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 UYVY pixels (16 bytes)
//   In:  %0 = packed source pointer (advances 16).
//   Out: d0 = 8 Y bytes; d2 = 4 U then 4 V.  d3 is overwritten as scratch.
//   vld2 puts the even bytes (U0 V0 U1 V1 ...) in d2 and the odd bytes (Y)
//   in d3; Y is moved to d0 before d3 is reused to split the chroma.
#define READUYVY                                                               \
    MEMACCESS(0)                                                               \
    "vld2.8     {d2, d3}, [%0]!                \n" /* d2 = UV pairs, d3 = Y  */\
    "vmov.u8    d0, d3                         \n" /* Y to its usual home    */\
    "vmov.u8    d3, d2                         \n"                             \
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
95ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// YUV422TORGB_SETUP_REG: asm fragment that loads the conversion constants
// once, ahead of the pixel loop.
//   In:  named operands kUVToRB, kUVToG, kUVBiasBGR and kYToRgb, each holding
//        the address of the table of the same name.
//   Out: d24 = first 8 bytes of kUVToRB, d25 = first 8 bytes of kUVToG,
//        q13 = kUVBiasBGR[0] (BB) in every 16-bit lane,
//        q4  = kUVBiasBGR[1] (BG) in every 16-bit lane,
//        q14 = kUVBiasBGR[2] (BR) in every 16-bit lane,
//        q15 = kYToRgb[0] in every 32-bit lane.
// None of the address registers is written.  The callers bind them as
// input-only "r" operands, which the compiler assumes are unchanged by the asm
// statement, so the three biases are fetched with a single 64-bit load into
// d30 and broadcast with vdup instead of walking the pointer with
// post-increment loads.  d30 is only a temporary: the final load overwrites
// all of q15.
#define YUV422TORGB_SETUP_REG                                                  \
    MEMACCESS([kUVToRB])                                                       \
    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
    MEMACCESS([kUVToG])                                                        \
    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
    MEMACCESS([kUVBiasBGR])                                                    \
    "vld1.16    {d30}, [%[kUVBiasBGR]]         \n" /* d30 = BB, BG, BR, 0    */\
    "vdup.16    q13, d30[0]                    \n" /* BB in all lanes        */\
    "vdup.16    q4, d30[1]                     \n" /* BG in all lanes        */\
    "vdup.16    q14, d30[2]                    \n" /* BR in all lanes        */\
    MEMACCESS([kYToRgb])                                                       \
    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
1097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
// YUV422TORGB: asm fragment that converts 8 pixels from YUV to 8-bit B, G, R.
//   In:  d0 = 8 Y bytes; d2 = 4 U bytes (low half) then 4 V bytes (high half);
//        d24, d25, q4, q13, q14 and q15 as loaded by YUV422TORGB_SETUP_REG.
//   Out: d20 = 8 B, d21 = 8 G, d22 = 8 R, each produced by an unsigned
//        saturating ">> 6" narrow of the 16-bit sums.
//   Also overwrites q0, q1, q3, q8 and q9.
// Each chroma product is copied into both 16-bit halves of a 32-bit lane
// (vshll #16 followed by vaddw), which makes one U/V sample serve two
// neighbouring pixels.  The G chroma term is subtracted while the B and R
// terms are added.
#define YUV422TORGB                                                            \
    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
    "vmovl.u8   q0, d0                         \n" /* Y                      */\
    "vmovl.s16  q10, d1                        \n"                             \
    "vmovl.s16  q0, d0                         \n"                             \
    "vmul.s32   q10, q10, q15                  \n"                             \
    "vmul.s32   q0, q0, q15                    \n"                             \
    "vqshrun.s32 d0, q0, #16                   \n"                             \
    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
    "vadd.s16   d18, d19                       \n"                             \
    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
    "vaddw.u16  q1, q1, d16                    \n"                             \
    "vaddw.u16  q10, q10, d17                  \n"                             \
    "vaddw.u16  q3, q3, d18                    \n"                             \
    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
    "vqshrun.s16 d21, q0, #6                   \n" /* G */
1367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// YUV to RGB conversion constants.
1387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Y contribution to R,G,B.  Scale and bias.
1397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
1407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
1417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// U and V contributions to R,G,B.
1437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define UB -128 /* -min(128, round(2.018 * 64)) */
1447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define UG 25 /* -round(-0.391 * 64) */
1457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define VG 52 /* -round(-0.813 * 64) */
1467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define VR -102 /* -round(1.596 * 64) */
1477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Bias values to subtract 16 from Y and 128 from U and V.
1497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define BB (UB * 128            - YGB)
1507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define BG (UG * 128 + VG * 128 - YGB)
1517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define BR            (VR * 128 - YGB)
1527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic uvec8 kUVToRB  = { 128, 128, 128, 128, 102, 102, 102, 102,
1547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                          0, 0, 0, 0, 0, 0, 0, 0 };
1557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
1567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        0, 0, 0, 0, 0, 0, 0, 0 };
1577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
1587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
1597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YG
1617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YGB
1627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UB
1637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UG
1647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VG
1657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VR
1667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BB
1677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BG
1687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BR
169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I444ToARGBRow_NEON(const uint8* src_y,
171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV444
180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %3
190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
1917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
1927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
1937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
1947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
1957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGBRow_NEON(const uint8* src_y,
201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %3
220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
2217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
2227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
2237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
2247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
2257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I411ToARGBRow_NEON(const uint8* src_y,
231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV411
240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %3
250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
2517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
2527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
2537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
2547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
2557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToBGRARow_NEON(const uint8* src_y,
261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_bgra,
264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vswp.u8    d20, d22                       \n"
273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d19, #255                      \n"
274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_bgra),  // %3
281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
2827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
2837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
2847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
2857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
2867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToABGRRow_NEON(const uint8* src_y,
292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_abgr,
295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vswp.u8    d20, d22                       \n"
304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_abgr),  // %3
312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
3137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
3147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
3157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
3167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
3177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGBARow_NEON(const uint8* src_y,
323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_rgba,
326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
3287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d19, #255                      \n"
335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgba),  // %3
342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
3437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
3447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
3457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
3467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
3477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGB24Row_NEON(const uint8* src_y,
353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* src_u,
354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* src_v,
355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_rgb24,
356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int width) {
357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
3587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst3.8     {d20, d21, d22}, [%3]!         \n"
366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),      // %0
368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),      // %1
369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),      // %2
370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgb24),  // %3
371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)       // %4
3727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
3737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
3747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
3757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
3767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRAWRow_NEON(const uint8* src_y,
382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       const uint8* src_u,
383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       const uint8* src_v,
384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint8* dst_raw,
385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       int width) {
386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
3877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vswp.u8    d20, d22                       \n"
394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst3.8     {d20, d21, d22}, [%3]!         \n"
396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),    // %0
398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),    // %1
399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),    // %2
400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_raw),  // %3
401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)     // %4
4027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
4037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
4047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
4057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
4067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs 8 pixels into RGB565.
//   In:      d20 = B, d21 = G, d22 = R (8 bits per channel, one byte per pixel).
//   Out:     q0 = eight 16-bit values, B in bits 0-4, G in bits 5-10,
//            R in bits 11-15.
//   Scratch: q8, q9, q10 (q10 aliases d20/d21, so the inputs are destroyed).
#define ARGBTORGB565                                                           \
    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
    "vorr       q0, q0, q10                    \n"  /* BGR                  */
422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGB565Row_NEON(const uint8* src_y,
424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* src_u,
425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* src_v,
426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_rgb565,
427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int width) {
428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
4297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTORGB565
436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),    // %0
440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),    // %1
441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),    // %2
442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgb565),  // %3
443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)     // %4
4447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
4457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
4467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
4477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
4487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs 8 pixels into ARGB1555.
//   In:      d20 = B, d21 = G, d22 = R, d23 = A (one byte per pixel).
//   Out:     q0 = eight 16-bit values, B in bits 0-4, G in bits 5-9,
//            R in bits 10-14, A in bit 15.
//   Scratch: q1, q8, q9, q10, q11 (q10/q11 alias d20-d23, so the inputs are
//            destroyed).
// The first shift operates on q10, i.e. on d20 and d21 together, reducing
// both B and G to 5 bits in a single instruction.
#define ARGBTOARGB1555                                                         \
    "vshr.u8    q10, q10, #3                   \n"  /* B and G              */ \
    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGB1555Row_NEON(const uint8* src_y,
469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            const uint8* src_u,
470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            const uint8* src_v,
471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            uint8* dst_argb1555,
472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int width) {
473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
4747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTOARGB1555
482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),    // %0
486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),    // %1
487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),    // %2
488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb1555),  // %3
489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)     // %4
4907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
4917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
4927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
4937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
4947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs 8 pixels into ARGB4444.
//   In:      d20 = B, d21 = G, d22 = R, d23 = A (one byte per pixel).
//   Needs:   d4 must hold the mask of bits that vbic clears; the caller in
//            this file loads 0x0f into every byte of d4 before the loop.
//   Out:     q0 (d0, d1) = 16 bytes; per pixel one byte with G in the high
//            nibble and B in the low nibble, followed by one byte with A in
//            the high nibble and R in the low nibble.
//   Modifies d20-d23 in place.
#define ARGBTOARGB4444                                                         \
    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGB4444Row_NEON(const uint8* src_y,
509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            const uint8* src_u,
510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            const uint8* src_v,
511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            uint8* dst_argb4444,
512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int width) {
513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
5147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTOARGB4444
523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),    // %0
527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),    // %1
528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),    // %2
529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb4444),  // %3
530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)     // %4
5317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %5
5327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %6
5337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
5347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
5357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I400ToARGBRow_NEON(const uint8* src_y,
5417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
5427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
5447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV400
548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"
550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %1
556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %2
5577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %3
5587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %4
5597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),
5607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)
5617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Expands a row of 8-bit Y samples into 4-byte pixels: each source byte is
// written to the first three bytes of its output pixel and 255 to the fourth.
// Every pass of the loop consumes 8 bytes from src_y and emits 32 bytes to
// dst_argb. The loop is bottom-tested, so it always runs at least once and
// width is effectively rounded up to a multiple of 8.
// NOTE(review): callers presumably pass a positive multiple of 8 - confirm.
5667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J400ToARGBRow_NEON(const uint8* src_y,
567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"  // d23 = 8 x 255, the 4th byte of each pixel
5717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d20}, [%0]!                   \n"  // load 8 Y bytes, src_y += 8
575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov       d21, d20                       \n"  // copy Y into the 2nd channel
576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov       d22, d20                       \n"  // copy Y into the 3rd channel
577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 pixels per loop; sets flags for bgt
578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // interleaved store of 8 pixels (32 bytes)
580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %1
583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %2
584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    :
585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "cc", "memory", "d20", "d21", "d22", "d23"
586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row from a Y plane (src_y) plus an interleaved chroma plane
// (src_uv) into 4-byte pixels at dst_argb, with 255 in the 4th byte of each
// pixel. Each loop pass counts 8 pixels and stores 32 bytes; the loop is
// bottom-tested, so it runs at least once and width is rounded up to 8.
// YUV422TORGB_SETUP_REG, READNV12 and YUV422TORGB are macros defined outside
// this chunk. NOTE(review): presumably they load the constants named in the
// input operands, fetch 8 Y plus the matching UV bytes, and leave the
// converted channels in d20-d22 - confirm against the macro definitions.
589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid NV12ToARGBRow_NEON(const uint8* src_y,
590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_uv,
591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
5947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV12
598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 pixels per loop; sets flags for bgt
600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"  // 4th byte of every pixel = 255
601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // interleaved store of 8 pixels (32 bytes)
603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_uv),    // %1
606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %2
607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %3
6087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %4
6097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %5
6107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %6
6117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)  // %7
6127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Same loop as NV12ToARGBRow_NEON except that the source bytes are fetched
// with READNV21 instead of READNV12. Each pass counts 8 pixels and stores
// 32 bytes to dst_argb with 255 in the 4th byte of each pixel; the loop is
// bottom-tested, so it runs at least once and width is rounded up to 8.
// NOTE(review): READNV21 is defined outside this chunk; presumably it reads
// the chroma pairs in the opposite (V,U) order - confirm.
617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid NV21ToARGBRow_NEON(const uint8* src_y,
618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_uv,
619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
6227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV21
626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 pixels per loop; sets flags for bgt
628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"  // 4th byte of every pixel = 255
629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // interleaved store of 8 pixels (32 bytes)
631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_uv),    // %1
634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %2
635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %3
6367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %4
6377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %5
6387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %6
6397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)  // %7
6407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row from a Y plane (src_y) plus an interleaved chroma plane
// (src_uv) into 2-byte RGB565 pixels at dst_rgb565. Each loop pass counts
// 8 pixels and stores the 16 bytes held in q0; the loop is bottom-tested, so
// it runs at least once and width is rounded up to a multiple of 8.
// NOTE(review): the macros used here are defined outside this chunk;
// presumably ARGBTORGB565 packs the channels produced by YUV422TORGB into
// q0 - confirm against the macro definitions.
645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid NV12ToRGB565Row_NEON(const uint8* src_y,
646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* src_uv,
647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_rgb565,
648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int width) {
649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
6507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV12
654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 pixels per loop; sets flags for bgt
656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTORGB565
657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565 (16 bytes).
659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_uv),    // %1
662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgb565),  // %2
663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %3
6647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %4
6657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %5
6667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %6
6677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)  // %7
6687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Same loop as NV12ToRGB565Row_NEON except that the source bytes are fetched
// with READNV21 instead of READNV12. Each pass counts 8 pixels and stores
// the 16 bytes held in q0 to dst_rgb565; the loop is bottom-tested, so it
// runs at least once and width is rounded up to a multiple of 8.
// NOTE(review): all macros used here are defined outside this chunk;
// presumably READNV21 reads chroma pairs in (V,U) order - confirm.
673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid NV21ToRGB565Row_NEON(const uint8* src_y,
674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* src_uv,
675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_rgb565,
676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int width) {
677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
6787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV21
682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 pixels per loop; sets flags for bgt
684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTORGB565
685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565 (16 bytes).
687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_uv),    // %1
690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgb565),  // %2
691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %3
6927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %4
6937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %5
6947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %6
6957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)  // %7
6967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row of packed source data at src_yuy2 into 4-byte pixels at
// dst_argb, with 255 in the 4th byte of each pixel. Each loop pass counts
// 8 pixels and stores 32 bytes; the loop is bottom-tested, so it runs at
// least once and width is rounded up to a multiple of 8.
// NOTE(review): READYUY2, YUV422TORGB_SETUP_REG and YUV422TORGB are defined
// outside this chunk; presumably READYUY2 unpacks Y0,U,Y1,V groups and the
// converted channels end up in d20-d22 - confirm.
701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToARGBRow_NEON(const uint8* src_yuy2,
702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
7057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUY2
709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 pixels per loop; sets flags for bgt
711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"  // 4th byte of every pixel = 255
712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // interleaved store of 8 pixels (32 bytes)
714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_yuy2),  // %0
716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %1
717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %2
7187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %3
7197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %4
7207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %5
7217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)  // %6
7227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Same loop as YUY2ToARGBRow_NEON except that the source bytes are fetched
// with READUYVY instead of READYUY2. Each pass counts 8 pixels and stores
// 32 bytes to dst_argb with 255 in the 4th byte of each pixel; the loop is
// bottom-tested, so it runs at least once and width is rounded up to 8.
// NOTE(review): READUYVY is defined outside this chunk; presumably it
// unpacks U,Y0,V,Y1 groups - confirm.
727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid UYVYToARGBRow_NEON(const uint8* src_uyvy,
728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
7317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUV422TORGB_SETUP_REG
732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READUYVY
735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    YUV422TORGB
736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 pixels per loop; sets flags for bgt
737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"  // 4th byte of every pixel = 255
738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // interleaved store of 8 pixels (32 bytes)
740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_uyvy),  // %0
742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %1
743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %2
7447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : [kUVToRB]"r"(&kUVToRB),   // %3
7457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVToG]"r"(&kUVToG),     // %4
7467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %5
7477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      [kYToRgb]"r"(&kYToRgb)  // %6
7487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 16 pairs of UV and writes the even bytes to dst_u and the odd bytes to dst_v.
// Each loop pass consumes 32 bytes from src_uv and emits 16 bytes to each
// destination. The loop is bottom-tested, so it always runs at least once
// and width is effectively rounded up to a multiple of 16.
// NOTE(review): callers presumably pass a positive multiple of 16 - confirm.
754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     int width) {
756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV; de-interleaves to q0 = U, q1 = V
761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 processed per loop
762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store U
764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q1}, [%2]!                    \n"  // store V
766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_uv),  // %0
768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_u),   // %1
769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_v),   // %2
770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)    // %3  // Output registers
771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    :                       // Input registers
772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1"  // Clobber List
773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 16 U's and V's and writes out 16 pairs of UV.
// Each loop pass consumes 16 bytes from src_u and 16 from src_v and emits
// 32 interleaved bytes (U0,V0,U1,V1,...) to dst_uv. The loop is
// bottom-tested, so it always runs at least once and width is effectively
// rounded up to a multiple of 16.
// NOTE(review): callers presumably pass a positive multiple of 16 - confirm.
777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     int width) {
779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load U
784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%1]!                    \n"  // load V
786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 processed per loop
787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    :
791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),   // %0
792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),   // %1
793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_uv),  // %2
794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)    // %3  // Output registers
795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    :                       // Input registers
796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1"  // Clobber List
797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Copy multiple of 32 bytes from src to dst using vld1.8 / vst1.8.
// Each loop pass moves 32 bytes through d0-d3 (the registers covered by the
// q0/q1 clobbers). The loop is bottom-tested, so it always runs at least
// once and count is effectively rounded up to a multiple of 32.
// NOTE(review): the earlier comment named vld4.8 and said it "allow
// unaligned and is fastest on a15"; the code uses vld1.8, and the
// performance claim cannot be verified from this file.
801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid CopyRow_NEON(const uint8* src, uint8* dst, int count) {
802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #32                    \n"  // 32 processed per loop
808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while count > 0
811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src),   // %0
812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst),   // %1
813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(count)  // %2  // Output registers
814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :                     // Input registers
815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
8197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// SetRow writes 'count' bytes using an 8 bit value repeated.
// The value is broadcast into q0 once, then 16 bytes are stored per loop
// pass. The loop is bottom-tested, so it always stores at least 16 bytes
// and count is effectively rounded up to a multiple of 16.
// NOTE(review): callers presumably pass a positive multiple of 16 - confirm.
8207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid SetRow_NEON(uint8* dst, uint8 v8, int count) {
821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
8227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
8237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  "1:                                          \n"
824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8    {q0}, [%0]!                     \n"  // store
827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt       1b                              \n"  // flags come from the subs above
828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(dst),   // %0
829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(count)  // %1
8307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  : "r"(v8)      // %2
831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0"
832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
8357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// ARGBSetRow writes 'count' pixels using a 32 bit value repeated.
// The value is broadcast into the four 32-bit lanes of q0 once, then 16
// bytes (4 pixels) are stored per loop pass. The loop is bottom-tested, so
// it always stores at least 4 pixels and count is effectively rounded up to
// a multiple of 4.
// NOTE(review): callers presumably pass a positive multiple of 4 - confirm.
8367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
8377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  asm volatile (
8387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
8397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  "1:                                          \n"
8407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
8417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    MEMACCESS(0)
8427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vst1.8    {q0}, [%0]!                     \n"  // store
8437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "bgt       1b                              \n"  // flags come from the subs above
8447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  : "+r"(dst),   // %0
8457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "+r"(count)  // %1
8467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  : "r"(v32)     // %2
8477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  : "cc", "memory", "q0"
8487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  );
849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Writes the bytes of src to dst in reverse order. The source pointer is
// first moved to the last 16 bytes of the row and then walks backwards 16
// bytes per loop pass while dst walks forwards. Each 16-byte group is
// reversed by vrev64.8 (reverses bytes inside each 64-bit half) followed by
// storing the high half (d1) before the low half (d0).
// The loop is bottom-tested, so it always runs at least once and width is
// effectively rounded up to a multiple of 16. r3 is used as a fixed scratch
// register for the -16 stride and is listed in the clobbers.
// NOTE(review): callers presumably pass a positive multiple of 16 - confirm.
851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Start at end of source row.
854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "mov        r3, #-16                       \n"  // post-index stride for the load
855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %0, %0, %2                     \n"  // src += width
856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "sub        %0, #16                        \n"  // src now points at the last 16 bytes
857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"  // 4-byte align the loop label
859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, #16                        \n"  // 16 pixels per loop.
863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrev64.8   q0, q0                         \n"  // reverse bytes within d0 and within d1
864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%1]!                    \n"  // high half first; dst += 8
866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // then low half; dst += 8 (16 in total)
868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0
869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src),   // %0
870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst),   // %1
871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)  // %2
872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "r3", "q0"
874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      int width) {
879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Start at end of source row.
881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "mov        r12, #-16                      \n"
882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %0, %0, %3, lsl #1             \n"
883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "sub        %0, #16                        \n"
884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, #8                         \n"  // 8 pixels per loop.
890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrev64.8   q0, q0                         \n"
891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"
895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_uv),  // %0
897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),   // %1
898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),   // %2
899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)    // %3
900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "r12", "q0"
902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Start at end of source row.
908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "mov        r3, #-16                       \n"
909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %0, %0, %2, lsl #2             \n"
910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "sub        %0, #16                        \n"
911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, #4                         \n"  // 4 pixels per loop.
917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrev64.32  q0, q0                         \n"
918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"
922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src),   // %0
924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst),   // %1
925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)  // %2
926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "r3", "q0"
928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #255                       \n"  // Alpha
934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb24),  // %0
943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %1
944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)         // %2
945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #255                       \n"  // Alpha
953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vswp.u8    d1, d3                         \n"  // swap R, B
959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_raw),   // %0
963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),  // %1
964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Assembly fragment: unpack 8 RGB565 pixels to 8 bits per channel.
//   In:      q0 = 8 x 16-bit pixels (RRRRRGGG GGGBBBBB).
//   Out:     d0 = B, d1 = G, d2 = R.  Each channel is widened by repeating its
//            top bits in the low bits (5 -> 8 for B and R, 6 -> 8 for G).
//   Scratch: q2 (d4, d5), d6.
//   d3 is not touched, so a caller may preload it (e.g. with alpha).
#define RGB565TOARGB                                                           \
    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // Alpha
985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb565),  // %0
995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %1
996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)          // %2
997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Assembly fragment: unpack 8 ARGB1555 pixels to 8 bits per channel.
//   In:      q0 = 8 x 16-bit pixels (ARRRRRGG GGGBBBBB).
//   Out:     d0 = B, d1 = G, d2 = R, d3 = A.  The 5-bit channels are widened
//            by repeating their top 3 bits; the alpha bit becomes 0 or 255.
//   Scratch: q2 (d4, d5), q3 (d6, d7).
//   All of q1 (d2, d3) is overwritten, so any value preloaded in d3 is lost.
#define ARGB1555TOARGB                                                         \
    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */
1015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
//   In:      q0 = 8 x 16-bit pixels (xRRRRRGG GGGBBBBB).
//   Out:     d0 = B, d1 = G, d2 = R, each widened from 5 to 8 bits by
//            repeating the top 3 bits in the low bits.
//   Scratch: q2 (d4, d5), d6.
//   d3 is not touched, so a caller may preload it (e.g. with alpha).
#define RGB555TOARGB                                                           \
    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
1028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
1030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int pix) {
1031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // Alpha
1033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
1037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB1555TOARGB
1039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
1041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb1555),  // %0
1043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %1
1044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)          // %2
1045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
1047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Assembly fragment: unpack 8 ARGB4444 pixels to 8 bits per channel.
//   In:      q0 = 8 x 16-bit pixels (AAAARRRR GGGGBBBB).
//   Out:     d0 = B, d1 = G, d2 = R, d3 = A.  Each 4-bit channel is widened
//            by duplicating the nibble into both halves of the byte.
//   Scratch: q2 (d4, d5).
//   All of q1 (d2, d3) is overwritten, so any value preloaded in d3 is lost.
#define ARGB4444TOARGB                                                         \
    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
1059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
1061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int pix) {
1062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // Alpha
1064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
1068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
1070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
1072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb4444),  // %0
1074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %1
1075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)          // %2
1076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
1078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
1082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
1087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
1090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),   // %0
1092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_rgb24),  // %1
1093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)         // %2
1094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
1096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
1100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
1105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vswp.u8    d1, d3                         \n"  // swap R, B
1107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
1109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_raw),   // %1
1112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
1113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
1115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
1119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
1124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
1125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
1127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_yuy2),  // %0
1129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
1130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
1131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
1133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
1137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
1142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
1143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
1145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_uyvy),  // %0
1147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
1148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
1149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
1151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int pix) {
1156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
1161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
1162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
1164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
1166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_yuy2),  // %0
1168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
1170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %3
1171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
1173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int pix) {
1178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
1183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
1184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
1186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
1188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_uyvy),  // %0
1190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
1192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %3
1193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
1195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
1200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // stride + src_yuy2
1202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
1206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
1207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
1209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
1210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
1211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
1213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
1215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_yuy2),     // %0
1217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(stride_yuy2),  // %1
1218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),        // %2
1219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),        // %3
1220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)           // %4
1221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
1223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
1228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // stride + src_uyvy
1230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
1234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
1235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
1237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
1238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
1239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
1241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
1243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_uyvy),     // %0
1245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(stride_uyvy),  // %1
1246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),        // %2
1247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),        // %3
1248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)           // %4
1249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
1251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* shuffler, int pix) {
1257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q2}, [%3]                     \n"  // shuffler
1260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
1263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #4                     \n"  // 4 processed per loop
1264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
1265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
1266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
1268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),  // %1
1271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
1272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(shuffler)    // %3
1273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
1274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToYUY2Row_NEON(const uint8* src_y,
1278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
1279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
1280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_yuy2, int width) {
1281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
1286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
1288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
1290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels
1291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
1293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_y),     // %0
1295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_u),     // %1
1296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_v),     // %2
1297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_yuy2),  // %3
1298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)      // %4
1299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3"
1301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToUYVYRow_NEON(const uint8* src_y,
1305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
1306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
1307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_uyvy, int width) {
1308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
1313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
1315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
1317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels
1318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
1320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_y),     // %0
1322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_u),     // %1
1323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_v),     // %2
1324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_uyvy),  // %3
1325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)      // %4
1326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3"
1328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
1332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTORGB565
1339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
1341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_rgb565),  // %1
1344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
1345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
13507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
13517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                const uint32 dither4, int width) {
13527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  asm volatile (
13537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ".p2align   2                              \n"
13547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vdup.32    d2, %2                         \n"  // dither4
13557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  "1:                                          \n"
13567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    MEMACCESS(1)
13577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
13587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
13597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vqadd.u8   d20, d20, d2                   \n"
13607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vqadd.u8   d21, d21, d2                   \n"
13617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vqadd.u8   d22, d22, d2                   \n"
13627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ARGBTORGB565
13637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    MEMACCESS(0)
13647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
13657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "bgt        1b                             \n"
13667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  : "+r"(dst_rgb)    // %0
13677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  : "r"(src_argb),   // %1
13687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "r"(dither4),    // %2
13697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "r"(width)       // %3
13707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
13717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  );
13727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
13737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int pix) {
1376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTOARGB1555
1383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
1385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb1555),  // %1
1388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
1389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int pix) {
1396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
1398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTOARGB4444
1404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
1406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),      // %0
1408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb4444),  // %1
1409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)            // %2
1410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
1418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
1419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
1420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #16                       \n"  // Add 16 constant
1421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
1427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
1428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
1429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
1430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d27                        \n"
1431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
1433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
1436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
1437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
1439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
1445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
1446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
1447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
1453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
1454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
1455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
1456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
1458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
1461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
1462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
1464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 8x1 pixels.
1468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int pix) {
1470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
1472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
1473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
1474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
1475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
1476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
1482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
1483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlsl.u8   q2, d1, d25                    \n"  // G
1484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlsl.u8   q2, d2, d26                    \n"  // R
1485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
1486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q3, d2, d24                    \n"  // R
1488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlsl.u8   q3, d1, d28                    \n"  // G
1489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlsl.u8   q3, d0, d27                    \n"  // B
1490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
1491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
1493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
1494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
1503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %3
1504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
1506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
1510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int pix) {
1512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
1531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q0, q10                    \n"  // B
1532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q1, q11                    \n"  // G
1533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q2, q12                    \n"  // R
1534ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
1535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q2, q10                    \n"  // R
1537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q1, q14                    \n"  // G
1538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q0, q13                    \n"  // B
1539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
1540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
1542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
1552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %3
1553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3",
1555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
1560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int pix) {
1562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
1580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
1582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
1583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
1584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
1585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
1587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d1, d8, d9                     \n"  // B
1588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
1589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d3, d10, d11                   \n"  // G
1590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
1591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d5, d12, d13                   \n"  // R
1592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
1598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q0, q10                    \n"  // B
1599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q1, q11                    \n"  // G
1600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q2, q12                    \n"  // R
1601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
1602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q2, q10                    \n"  // R
1603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q1, q14                    \n"  // G
1604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q0, q13                    \n"  // B
1605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
1606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
1607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
1616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %3
1617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
// RGBTOUV expands to the U/V arithmetic used by the row functions that
// follow.  Contract, as an asm fragment:
//   inputs : three q registers holding 16-bit B, G and R values;
//            q10-q14 hold the coefficients and q15 the bias, all of which
//            the enclosing asm block must have loaded beforehand.
//   scratch: q8 (U accumulator), q9 (V accumulator).
//   outputs: 8 U bytes in d0, 8 V bytes in d1 (so q0 is overwritten).
#define RGBTOUV(B_REG, G_REG, R_REG) \
    "vmul.s16   q8, " #B_REG ", q10               \n"  /* U  = B * UB          */ \
    "vmls.s16   q8, " #G_REG ", q11               \n"  /* U -= G * UG          */ \
    "vmls.s16   q8, " #R_REG ", q12               \n"  /* U -= R * UR          */ \
    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
    "vmul.s16   q9, " #R_REG ", q10               \n"  /* V  = R * VR          */ \
    "vmls.s16   q9, " #G_REG ", q14               \n"  /* V -= G * VG          */ \
    "vmls.s16   q9, " #B_REG ", q13               \n"  /* V -= B * VB          */ \
    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
1635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
1639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
1660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
1661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q0, q1, q2)
1670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_argb),  // %1
1677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
1679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
1680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Subsample match C code.
1687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint8* dst_u, uint8* dst_v, int pix) {
1689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
1692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
1693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
1694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
1695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
1696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
1710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
1711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q0, q1, q2)
1720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_argb),  // %1
1727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
1729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
1730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Reads two adjacent rows of 4-byte BGRA pixels and writes one row of U and
// one row of V bytes, one U/V pair per 2x2 block of source pixels.
//   src_bgra         first source row; the second row is read from
//                    src_bgra + src_stride_bgra.
//   dst_u, dst_v     destinations; 8 bytes are stored to each per loop pass.
//   pix              source pixels per row. The counter is decremented by 16
//                    per pass and the loop repeats while it is > 0, so the
//                    body always runs at least once; presumably callers pass
//                    a positive multiple of 16 - TODO confirm at call sites.
// vld4.8 de-interleaves the 4 bytes of each pixel, so q0..q3 (row 0) and
// q4..q7 (row 1) hold bytes 0..3 of 16 pixels. Only bytes 1..3 (R, G, B per
// the lane comments below) feed the pairwise sums; byte 0 is not read here.
// q10..q15 hold halved coefficient magnitudes and the 0x8080 bias; they are
// presumably consumed by RGBTOUV, which is defined outside this function (as
// is MEMACCESS) - the halving matches the "2x average" channel sums.
1736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
1738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // %1 = src_bgra + src_stride_bgra (row 1 pointer)
1740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
1746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // row 0: load 8 BGRA pixels.
1750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // row 0: load next 8 BGRA pixels.
1752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts (adjacent pairs summed).
1753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
1754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
1755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // row 1: load 8 more BGRA pixels.
1757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // row 1: load last 8 BGRA pixels.
1759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q3, q7                         \n"  // B: accumulate row 1 pair sums (2x2 sum).
1760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // G: accumulate row 1 pair sums.
1761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // R: accumulate row 1 pair sums.
1762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"  // rounded (sum of 4) / 2 = 2x average
1764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q3, q3, #1                     \n"
1766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows = 32 processed per loop; sets flags for bgt.
1768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q3, q2, q1)  // args are the B, G, R sums; presumably leaves U in d0, V in d1.
1769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // loop while remaining pix > 0.
1774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_bgra),  // %0
1775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_bgra),  // %1 (int stride on entry, row 1 pointer after the add)
1776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
1778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
1779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Reads two adjacent rows of 4-byte ABGR pixels and writes one row of U and
// one row of V bytes, one U/V pair per 2x2 block of source pixels.
//   src_abgr         first source row; the second row is read from
//                    src_abgr + src_stride_abgr.
//   dst_u, dst_v     destinations; 8 bytes are stored to each per loop pass.
//   pix              source pixels per row. The counter is decremented by 16
//                    per pass and the loop repeats while it is > 0, so the
//                    body always runs at least once; presumably callers pass
//                    a positive multiple of 16 - TODO confirm at call sites.
// vld4.8 de-interleaves the 4 bytes of each pixel, so q0..q3 (row 0) and
// q4..q7 (row 1) hold bytes 0..3 of 16 pixels. Bytes 0..2 (R, G, B per the
// lane comments below) feed the pairwise sums; byte 3 (q3/q7) is not read.
// q10..q15 hold halved coefficient magnitudes and the 0x8080 bias; they are
// presumably consumed by RGBTOUV, which is defined outside this function (as
// is MEMACCESS) - the halving matches the "2x average" channel sums.
1785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
1787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // %1 = src_abgr + src_stride_abgr (row 1 pointer)
1789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
1795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // row 0: load 8 ABGR pixels.
1799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // row 0: load next 8 ABGR pixels.
1801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts (adjacent pairs summed).
1802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
1804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // row 1: load 8 more ABGR pixels.
1806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // row 1: load last 8 ABGR pixels.
1808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // B: accumulate row 1 pair sums (2x2 sum).
1809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G: accumulate row 1 pair sums.
1810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // R: accumulate row 1 pair sums.
1811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // rounded (sum of 4) / 2 = 2x average
1813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows = 32 processed per loop; sets flags for bgt.
1817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q2, q1, q0)  // args are the B, G, R sums; presumably leaves U in d0, V in d1.
1818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1820ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // loop while remaining pix > 0.
1823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_abgr),  // %0
1824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_abgr),  // %1 (int stride on entry, row 1 pointer after the add)
1825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
1827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
1828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Reads two adjacent rows of 4-byte RGBA pixels and writes one row of U and
// one row of V bytes, one U/V pair per 2x2 block of source pixels.
//   src_rgba         first source row; the second row is read from
//                    src_rgba + src_stride_rgba.
//   dst_u, dst_v     destinations; 8 bytes are stored to each per loop pass.
//   pix              source pixels per row. The counter is decremented by 16
//                    per pass and the loop repeats while it is > 0, so the
//                    body always runs at least once; presumably callers pass
//                    a positive multiple of 16 - TODO confirm at call sites.
// vld4.8 de-interleaves the 4 bytes of each pixel, so q0..q3 (row 0) and
// q4..q7 (row 1) hold bytes 0..3 of 16 pixels. Bytes 1..3 (B, G, R per the
// lane comments below) are summed into q0..q2; the byte-0 lanes loaded into
// q0 are overwritten by the first vpaddl without being read.
// q10..q15 hold halved coefficient magnitudes and the 0x8080 bias; they are
// presumably consumed by RGBTOUV, which is defined outside this function (as
// is MEMACCESS) - the halving matches the "2x average" channel sums.
1834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
1835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
1836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // %1 = src_rgba + src_stride_rgba (row 1 pointer)
1838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
1844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // row 0: load 8 RGBA pixels.
1848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // row 0: load next 8 RGBA pixels.
1850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q1                         \n"  // B (q1) 16 bytes -> 8 shorts in q0.
1851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q2                         \n"  // G (q2) 16 bytes -> 8 shorts in q1.
1852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q3                         \n"  // R (q3) 16 bytes -> 8 shorts in q2.
1853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // row 1: load 8 more RGBA pixels.
1855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // row 1: load last 8 RGBA pixels.
1857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q5                         \n"  // B: accumulate row 1 pair sums (2x2 sum).
1858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q6                         \n"  // G: accumulate row 1 pair sums.
1859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q7                         \n"  // R: accumulate row 1 pair sums.
1860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // rounded (sum of 4) / 2 = 2x average
1862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows = 32 processed per loop; sets flags for bgt.
1866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q0, q1, q2)  // args are the B, G, R sums; presumably leaves U in d0, V in d1.
1867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // loop while remaining pix > 0.
1872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgba),  // %0
1873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_rgba),  // %1 (int stride on entry, row 1 pointer after the add)
1874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
1876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
1877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Reads two adjacent rows of 3-byte RGB24 pixels and writes one row of U and
// one row of V bytes, one U/V pair per 2x2 block of source pixels.
//   src_rgb24        first source row; the second row is read from
//                    src_rgb24 + src_stride_rgb24.
//   dst_u, dst_v     destinations; 8 bytes are stored to each per loop pass.
//   pix              source pixels per row. The counter is decremented by 16
//                    per pass and the loop repeats while it is > 0, so the
//                    body always runs at least once; presumably callers pass
//                    a positive multiple of 16 - TODO confirm at call sites.
// vld3.8 de-interleaves the 3 bytes of each pixel, so q0..q2 (row 0) and
// q4..q6 (row 1) hold bytes 0..2 of 16 pixels (B, G, R per the lane comments
// below).
// q10..q15 hold halved coefficient magnitudes and the 0x8080 bias; they are
// presumably consumed by RGBTOUV, which is defined outside this function (as
// is MEMACCESS) - the halving matches the "2x average" channel sums.
1883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
1884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint8* dst_u, uint8* dst_v, int pix) {
1885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // %1 = src_rgb24 + src_stride_rgb24 (row 1 pointer)
1887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
1893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // row 0: load 8 RGB24 pixels.
1897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // row 0: load next 8 RGB24 pixels.
1899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts (adjacent pairs summed).
1900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // row 1: load 8 more RGB24 pixels.
1904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // row 1: load last 8 RGB24 pixels.
1906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // B: accumulate row 1 pair sums (2x2 sum).
1907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G: accumulate row 1 pair sums.
1908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // R: accumulate row 1 pair sums.
1909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // rounded (sum of 4) / 2 = 2x average
1911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows = 32 processed per loop; sets flags for bgt.
1915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q0, q1, q2)  // args are the B, G, R sums; presumably leaves U in d0, V in d1.
1916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // loop while remaining pix > 0.
1921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb24),  // %0
1922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_rgb24),  // %1 (int stride on entry, row 1 pointer after the add)
1923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
1925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
1926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Reads two adjacent rows of 3-byte RAW pixels and writes one row of U and
// one row of V bytes, one U/V pair per 2x2 block of source pixels.
//   src_raw          first source row; the second row is read from
//                    src_raw + src_stride_raw.
//   dst_u, dst_v     destinations; 8 bytes are stored to each per loop pass.
//   pix              source pixels per row. The counter is decremented by 16
//                    per pass and the loop repeats while it is > 0, so the
//                    body always runs at least once; presumably callers pass
//                    a positive multiple of 16 - TODO confirm at call sites.
// vld3.8 de-interleaves the 3 bytes of each pixel, so q0..q2 (row 0) and
// q4..q6 (row 1) hold bytes 0..2 of 16 pixels (R, G, B per the lane comments
// below, i.e. the reverse of the RGB24 variant).
// q10..q15 hold halved coefficient magnitudes and the 0x8080 bias; they are
// presumably consumed by RGBTOUV, which is defined outside this function (as
// is MEMACCESS) - the halving matches the "2x average" channel sums.
1932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
1933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_u, uint8* dst_v, int pix) {
1934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // %1 = src_raw + src_stride_raw (row 1 pointer)
1936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
1942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // row 0: load 8 RAW pixels.
1946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // row 0: load next 8 RAW pixels.
1948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts (adjacent pairs summed).
1949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
1951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // row 1: load 8 more RAW pixels.
1953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // row 1: load last 8 RAW pixels.
1955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // B: accumulate row 1 pair sums (2x2 sum).
1956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G: accumulate row 1 pair sums.
1957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // R: accumulate row 1 pair sums.
1958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // rounded (sum of 4) / 2 = 2x average
1960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows = 32 processed per loop; sets flags for bgt.
1964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q2, q1, q0)  // args are the B, G, R sums; presumably leaves U in d0, V in d1.
1965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // loop while remaining pix > 0.
1970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_raw),  // %0
1971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_raw),  // %1 (int stride on entry, row 1 pointer after the add)
1972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
1974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
1975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 16x2 pixels -> 8x1.  pix is number of RGB565 pixels. e.g. 16.
1982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
1983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_u, uint8* dst_v, int pix) {
1984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
1993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
1996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
1997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
1998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
1999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
2002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
2003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
2009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
2010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
2011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
2012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
2015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
2016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
2021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q5, q5, #1                     \n"
2022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q6, q6, #1                     \n"
2023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
2025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q4, q10                    \n"  // B
2026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q5, q11                    \n"  // G
2027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q6, q12                    \n"  // R
2028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
2029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q6, q10                    \n"  // R
2030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q5, q14                    \n"  // G
2031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q4, q13                    \n"  // B
2032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
2033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
2034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
2038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb565),  // %0
2041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_rgb565),  // %1
2042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
2043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
2044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
2045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
2052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
2053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_u, uint8* dst_v, int pix) {
2054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb
2056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
2057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
2058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
2059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
2060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
2066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB555TOARGB
2067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
2068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
2069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
2072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB555TOARGB
2073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
2079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB555TOARGB
2080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
2081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
2082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
2085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB555TOARGB
2086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
2091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q5, q5, #1                     \n"
2092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q6, q6, #1                     \n"
2093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
2095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q4, q10                    \n"  // B
2096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q5, q11                    \n"  // G
2097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q6, q12                    \n"  // R
2098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
2099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q6, q10                    \n"  // R
2100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q5, q14                    \n"  // G
2101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q4, q13                    \n"  // B
2102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
2103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
2104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
2108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb1555),  // %0
2111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_argb1555),  // %1
2112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
2113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
2114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
2115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
2122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
2123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_u, uint8* dst_v, int pix) {
2124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb
2126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
2127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
2128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
2129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
2130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
2136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
2137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
2138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
2139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
2142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
2143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
2149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
2150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
2151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
2152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
2155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
2156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
2157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
2158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
2159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
2161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q5, q5, #1                     \n"
2162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q6, q6, #1                     \n"
2163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
2165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q4, q10                    \n"  // B
2166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q5, q11                    \n"  // G
2167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q6, q12                    \n"  // R
2168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
2169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q6, q10                    \n"  // R
2170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q5, q14                    \n"  // G
2171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q4, q13                    \n"  // B
2172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
2173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
2174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
2178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
2179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb4444),  // %0
2181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_argb4444),  // %1
2182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
2183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
2184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %4
2185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
2192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
2201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
2203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
2204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
2205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
2206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d27                        \n"
2208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb565),  // %0
2212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),       // %1
2213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)          // %2
2214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
2220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
2229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB1555TOARGB
2231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
2232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
2233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
2234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d27                        \n"
2236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb1555),  // %0
2240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),         // %1
2241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)            // %2
2242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
2248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
2257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
2259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
2260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
2261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
2262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d27                        \n"
2264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb4444),  // %0
2268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),         // %1
2269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)            // %2
2270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
2276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
2285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d1, d4                     \n"  // R
2287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d5                     \n"  // G
2288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d3, d6                     \n"  // B
2289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_bgra),  // %0
2295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
2296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
2297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
2303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
2312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d0, d4                     \n"  // R
2314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d1, d5                     \n"  // G
2315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d6                     \n"  // B
2316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_abgr),  // %0
2322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),  // %1
2323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
2324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
2330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
2332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
2334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
2339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d1, d4                     \n"  // B
2341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d5                     \n"  // G
2342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d3, d6                     \n"  // R
2343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgba),  // %0
2349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),  // %1
2350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
2351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
2357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
2359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
2361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
2366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d0, d4                     \n"  // B
2368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d1, d5                     \n"  // G
2369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d6                     \n"  // R
2370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb24),  // %0
2376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),  // %1
2377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
2378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
2384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
2393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d0, d4                     \n"  // B
2395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d1, d5                     \n"  // G
2396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d6                     \n"  // R
2397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_raw),  // %0
2403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),  // %1
2404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(pix)        // %2
2405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear filter 16x2 -> 16x1
2411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid InterpolateRow_NEON(uint8* dst_ptr,
2412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* src_ptr, ptrdiff_t src_stride,
2413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int dst_width, int source_y_fraction) {
2414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "cmp        %4, #0                         \n"
2416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "beq        100f                           \n"
2417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %2, %1                         \n"
2418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "cmp        %4, #64                        \n"
2419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "beq        75f                            \n"
2420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "cmp        %4, #128                       \n"
2421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "beq        50f                            \n"
2422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "cmp        %4, #192                       \n"
2423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "beq        25f                            \n"
2424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.8     d5, %4                         \n"
2426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "rsb        %4, #256                       \n"
2427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.8     d4, %4                         \n"
2428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // General purpose row blend.
2429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"
2432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%2]!                    \n"
2434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"
2435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q13, d0, d4                    \n"
2436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q14, d1, d4                    \n"
2437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q13, d2, d5                    \n"
2438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q14, d3, d5                    \n"
2439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshrn.u16 d0, q13, #8                    \n"
2440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshrn.u16 d1, q14, #8                    \n"
2441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"
2443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "b          99f                            \n"
2445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 25 / 75.
2447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "25:                                         \n"
2448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"
2450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%2]!                    \n"
2452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"
2453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  q0, q1                         \n"
2454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  q0, q1                         \n"
2455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"
2457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        25b                            \n"
2458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "b          99f                            \n"
2459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 50 / 50.
2461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "50:                                         \n"
2462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"
2464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%2]!                    \n"
2466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"
2467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  q0, q1                         \n"
2468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"
2470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        50b                            \n"
2471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "b          99f                            \n"
2472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 75 / 25.
2474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "75:                                         \n"
2475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%1]!                    \n"
2477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%2]!                    \n"
2479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"
2480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  q0, q1                         \n"
2481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  q0, q1                         \n"
2482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"
2484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        75b                            \n"
2485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "b          99f                            \n"
2486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 100 / 0 - Copy row unchanged.
2488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "100:                                        \n"
2489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"
2491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"
2492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"
2494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        100b                           \n"
2495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "99:                                         \n"
2497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(dst_ptr),          // %0
2498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_ptr),          // %1
2499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride),       // %2
2500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_width),        // %3
2501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(source_y_fraction) // %4
2502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
2504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint8* dst_argb, int width) {
2510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, #8                         \n"
2512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "blt        89f                            \n"
2513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 8 pixels.
2514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "8:                                          \n"
2515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
2517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
2519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q10, d4, d3                    \n"  // db * a
2521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q11, d5, d3                    \n"  // dg * a
2522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q12, d6, d3                    \n"  // dr * a
2523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
2524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
2525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
2526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
2527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
2528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
2529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d2, d2, d6                     \n"  // + sr
2530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // a = 255
2531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
2533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bge        8b                             \n"
2534ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "89:                                         \n"
2536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "adds       %3, #8-1                       \n"
2537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "blt        99f                            \n"
2538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 1 pixels.
2540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
2543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
2545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
2546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q10, d4, d3                    \n"  // db * a
2547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q11, d5, d3                    \n"  // dg * a
2548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q12, d6, d3                    \n"  // dr * a
2549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
2550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
2551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
2552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
2553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
2554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
2555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d2, d2, d6                     \n"  // + sr
2556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // a = 255
2557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
2559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bge        1b                             \n"
2560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "99:                                         \n"
2562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb0),    // %0
2564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_argb1),    // %1
2565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),     // %2
2566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)         // %3
2567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
2569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Attenuate 8 pixels at a time.
2573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Attenuate 8 pixels.
2576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
2579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q10, d0, d3                    \n"  // b * a
2581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q11, d1, d3                    \n"  // g * a
2582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q12, d2, d3                    \n"  // r * a
2583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
2584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
2585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
2586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
2588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),   // %0
2590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %1
2591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %2
2592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
2594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Quantize 8 ARGB pixels (32 bytes).
2598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// dst = (dst * scale >> 16) * interval_size + interval_offset;
2599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int interval_offset, int width) {
2601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.u16   q8, %2                         \n"
2603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
2604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.u16   q9, %3                         \n"  // interval multiply.
2605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.u16   q10, %4                        \n"  // interval add
2606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
2612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
2613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
2614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q1, d2                         \n"
2615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q2, d4                         \n"
2616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
2617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqdmulh.s16 q1, q1, q8                    \n"  // g
2618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqdmulh.s16 q2, q2, q8                    \n"  // r
2619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
2620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.u16   q1, q1, q9                     \n"  // g
2621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.u16   q2, q2, q9                     \n"  // r
2622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
2623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q1, q1, q10                    \n"  // g
2624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q2, q2, q10                    \n"  // r
2625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d0, q0                         \n"
2626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d2, q1                         \n"
2627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d4, q2                         \n"
2628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
2630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(dst_argb),       // %0
2632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)           // %1
2633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(scale),           // %2
2634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "r"(interval_size),   // %3
2635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "r"(interval_offset)  // %4
2636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
2637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shade 8 pixels at a time by specified value.
2641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
2642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint32 value) {
2645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
2647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
2648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
2649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
2655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
2657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q11, d22                       \n"
2658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q12, d24                       \n"
2659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q13, d26                       \n"
2660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
2661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
2662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
2663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
2664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d20, q10                       \n"
2665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d22, q11                       \n"
2666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d24, q12                       \n"
2667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d26, q13                       \n"
2668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
2670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),       // %0
2672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),       // %1
2673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)           // %2
2674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(value)            // %3
2675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
2676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
2680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Similar to ARGBToYJ but stores ARGB.
2681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
2685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
2686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
2687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
2693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
2694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
2695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
2696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov       d1, d0                         \n"  // G
2697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov       d2, d0                         \n"  // R
2698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
2700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
2702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),  // %1
2703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)      // %2
2704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
2706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    b = (r * 35 + g * 68 + b * 17) >> 7
2711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    g = (r * 45 + g * 88 + b * 22) >> 7
2712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    r = (r * 50 + g * 98 + b * 24) >> 7
2713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d20, #17                       \n"  // BB coefficient
2716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d21, #68                       \n"  // BG coefficient
2717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d22, #35                       \n"  // BR coefficient
2718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #22                       \n"  // GB coefficient
2719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #88                       \n"  // GG coefficient
2720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #45                       \n"  // GR coefficient
2721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d28, #24                       \n"  // BB coefficient
2722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d29, #98                       \n"  // BG coefficient
2723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d30, #50                       \n"  // BR coefficient
2724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
2728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
2729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
2730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d21                    \n"  // G
2731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d22                    \n"  // R
2732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
2733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q3, d1, d25                    \n"  // G
2734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q3, d2, d26                    \n"  // R
2735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
2736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d1, d29                    \n"  // G
2737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d30                    \n"  // R
2738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
2739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
2740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
2741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
2743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(dst_argb),  // %0
2745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)      // %1
2746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3",
2748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q10", "q11", "q12", "q13", "q14", "q15"
2749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Tranform 8 ARGB pixels (32 bytes) with color matrix.
2753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
2754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// needs to saturate.  Consider doing a non-saturating version.
2755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                             const int8* matrix_argb, int width) {
2757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
2759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
2760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
2761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
2762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
2767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
2769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q9, d18                        \n"  // g
2770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q10, d20                       \n"  // r
27717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vmovl.u8   q11, d22                       \n"  // a
2772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
2773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
2774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
2775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
2776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
2777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
2778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
2779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
2780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
2781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
2782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
2785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
2786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
2787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
2788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
2789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
2790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
27927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
27937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
27947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
27957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
2796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
2797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
2798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
2801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
2802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
2803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
2804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
2806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),   // %0
2808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %1
2809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %2
2810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(matrix_argb)  // %3
28117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
2812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q10", "q11", "q12", "q13", "q14", "q15"
2813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
#ifdef HAS_ARGBMULTIPLYROW_NEON
// Multiplies two rows of ARGB pixels channel by channel, 8 pixels per pass.
//
// Each output byte is the 16 bit product of the two corresponding input bytes
// narrowed back to 8 bits with a rounding right shift by 8, i.e.
// (a * b + 128) >> 8.
//
//   src_argb0, src_argb1  the two input rows (4 bytes per pixel).
//   dst_argb              the output row (4 bytes per pixel).
//   width                 number of pixels to process.
//
// There is no tail handling: the loop body always runs at least once and
// consumes whole groups of 8 pixels until the remaining count is <= 0, so a
// width that is not a positive multiple of 8 reads and writes past `width`
// pixels.
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    ".p2align   2                              \n"
  "1:                                          \n"
    // De-interleave row 0 into the even D registers and row 1 into the odd
    // ones, so each channel's two operands sit in one Q register pair.
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // row 0: B, G, R, A
    MEMACCESS(1)
    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // row 1: B, G, R, A
    "subs       %3, %3, #8                     \n"  // 8 fewer pixels to go.
    "vmull.u8   q0, d0, d1                     \n"  // B0 * B1 -> 16 bit
    "vmull.u8   q1, d2, d3                     \n"  // G0 * G1 -> 16 bit
    "vmull.u8   q2, d4, d5                     \n"  // R0 * R1 -> 16 bit
    "vmull.u8   q3, d6, d7                     \n"  // A0 * A1 -> 16 bit
    "vrshrn.u16 d0, q0, #8                     \n"  // round, narrow B
    "vrshrn.u16 d1, q1, #8                     \n"  // round, narrow G
    "vrshrn.u16 d2, q2, #8                     \n"  // round, narrow R
    "vrshrn.u16 d3, q3, #8                     \n"  // round, narrow A
    MEMACCESS(2)
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // re-interleave, store 8.
    "bgt        1b                             \n"  // loop while width > 0.

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_NEON
2851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Add 2 rows of ARGB pixels together, 8 pixels at a time.
2853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
2855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
2863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
2865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
2866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb0),  // %0
2871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_argb1),  // %1
2872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %2
2873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %3
2874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"
2876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
2880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_argb, int width) {
2882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
2890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
2892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
2893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb0),  // %0
2898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_argb1),  // %1
2899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %2
2900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %3
2901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"
2903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// A = 255
2908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// R = Sobel
2909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G = Sobel
2910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// B = Sobel
2911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
2913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // alpha
2915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
2920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
2922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d0, d1                     \n"  // add
2924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d1, d0                         \n"
2925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d2, d0                         \n"
2926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_sobelx),  // %0
2930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_sobely),  // %1
2931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %2
2932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %3
2933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"
2935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Adds Sobel X and Sobel Y and stores Sobel into plane.
2939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_y, int width) {
2941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 16 pixel loop.
2943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
2947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
2949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
2950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q0, q0, q1                     \n"  // add
2951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
2953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_sobelx),  // %0
2955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_sobely),  // %1
2956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),       // %2
2957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %3
2958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"
2960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Mixes Sobel X, Sobel Y and Sobel into ARGB.
2964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// A = 255
2965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// R = Sobel X
2966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G = Sobel
2967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// B = Sobel Y
2968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
2970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // alpha
2972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
2974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
2977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
2979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d1, d0, d2                     \n"  // add
2981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_sobelx),  // %0
2985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_sobely),  // %1
2986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %2
2987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %3
2988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"
2990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// SobelX as a matrix is
2994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1  0  1
2995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -2  0  2
2996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1  0  1
2997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
2998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                    const uint8* src_y2, uint8* dst_sobelx, int width) {
2999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
3000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
3001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
3002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
3003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%0],%5                  \n"  // top
3004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
3005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d1}, [%0],%6                  \n"
3006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q0, d0, d1                     \n"
3007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
3008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
3009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
3010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%1],%6                  \n"
3011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q1, d2, d3                     \n"
3012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
3013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
3014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
3015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
3016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
3017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%2],%6                  \n"
3018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"  // 8 pixels
3019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q1, d2, d3                     \n"
3020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
3021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vabs.s16   q0, q0                         \n"
3022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d0, q0                         \n"
3023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
3024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
3025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
3026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_y0),      // %0
3027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_y1),      // %1
3028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_y2),      // %2
3029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_sobelx),  // %3
3030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %4
3031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(2),            // %5
3032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "r"(6)             // %6
3033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
3034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
3035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// SobelY as a matrix is
3038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1 -2 -1
3039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//  0  0  0
3040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//  1  2  1
3041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
3042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                    uint8* dst_sobely, int width) {
3043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
3044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ".p2align   2                              \n"
3045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
3046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
3047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%0],%4                  \n"  // left
3048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
3049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d1}, [%1],%4                  \n"
3050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q0, d0, d1                     \n"
3051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
3052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
3053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
3054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%1],%4                  \n"
3055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q1, d2, d3                     \n"
3056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
3057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
3058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
3059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%0],%5                  \n"  // right
3060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
3061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%1],%5                  \n"
3062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 pixels
3063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q1, d2, d3                     \n"
3064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
3065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vabs.s16   q0, q0                         \n"
3066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d0, q0                         \n"
3067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
3068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
3069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
3070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_y0),      // %0
3071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_y1),      // %1
3072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_sobely),  // %2
3073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %3
3074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(1),            // %4
3075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "r"(6)             // %5
3076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
3077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
3078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
30797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
3080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus
3082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}  // extern "C"
3083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}  // namespace libyuv
3084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
3085