/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
11ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "libyuv/row.h"
12ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
13ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus
14ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramaniannamespace libyuv {
15ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianextern "C" {
16ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
17ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
18ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// This module is for GCC Neon
19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    !defined(__aarch64__)
21ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y, 4 U and 4 V from 422.
// Loads 8 Y bytes from [%0] into d0, 4 U bytes from [%1] into the low 32 bits
// of d2 and 4 V bytes from [%2] into the high 32 bits of d2. All three
// pointers are post-incremented by the number of bytes read.
#define READYUV422                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
    MEMACCESS(2)                                                               \
    "vld1.32    {d2[1]}, [%2]!                 \n"
30ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y, 2 U and 2 V from 411.
// Loads 8 Y bytes from [%0] into d0, 2 U bytes from [%1] and 2 V bytes from
// [%2] into the first two 16-bit lanes of d2, then duplicates every chroma
// byte so that d2 ends up holding 4 U followed by 4 V. d3 is used as scratch.
#define READYUV411                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
    MEMACCESS(2)                                                               \
    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
    "vmov.u8    d3, d2                         \n" /* copy to zip with itself */\
    "vzip.u8    d2, d3                         \n" /* d2 = U0 U0 U1 U1 V0 V0 V1 V1 */
41ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y, 8 U and 8 V from 444.
// Loads 8 Y bytes into d0, 8 U bytes into d2 and 8 V bytes into d3, then
// reduces the chroma to the 4 U + 4 V layout used by the other readers:
// each horizontal pair of U (and of V) samples is summed and halved with
// rounding, leaving 4 averaged U followed by 4 averaged V in d2.
#define READYUV444                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    MEMACCESS(2)                                                               \
    "vld1.8     {d3}, [%2]!                    \n"                             \
    "vpaddl.u8  q1, q1                         \n" /* sum adjacent pairs     */\
    "vrshrn.u16 d2, q1, #1                     \n" /* rounded average        */
52ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y, and set 4 U and 4 V to 128.
// Only [%0] is read (and post-incremented); every byte of d2 is set to 128.
#define READYUV400                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    "vmov.u8    d2, #128                       \n"
58ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y and 4 UV from NV12.
// Loads 8 Y bytes from [%0] into d0 and 8 interleaved U,V bytes from [%1]
// into d2, then de-interleaves them so that d2 holds 4 U followed by 4 V.
// d3 is used as scratch.
#define READNV12                                                               \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d2, d3                         \n" /* d2 = U U, d3 = V V     */\
    "vtrn.u32   d2, d3                         \n" /* d2 = 4 U, 4 V          */
68ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 Y and 4 VU from NV21.
// Same as READNV12 except that the chroma bytes arrive in V,U order, so the
// unzip operands are swapped. d2 still ends up holding 4 U followed by 4 V.
#define READNV21                                                               \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even vu apart */\
    "vuzp.u8    d3, d2                         \n" /* d3 = V V, d2 = U U     */\
    "vtrn.u32   d2, d3                         \n" /* d2 = 4 U, 4 V          */
78ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 YUY2 pixels (16 bytes).
// The de-interleaving load puts the even bytes (Y) in d0 and the odd bytes
// (U,V alternating) in d2; the chroma is then split so that d2 holds 4 U
// followed by 4 V. d3 is used as scratch.
#define READYUY2                                                               \
    MEMACCESS(0)                                                               \
    "vld2.8     {d0, d2}, [%0]!                \n"                             \
    "vmov.u8    d3, d2                         \n"                             \
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
86ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Read 8 UYVY pixels (16 bytes).
// The de-interleaving load puts the even bytes (U,V alternating) in d2 and
// the odd bytes (Y) in d3. Y is moved to d0, then the chroma is split so that
// d2 holds 4 U followed by 4 V. d3 is used as scratch.
#define READUYVY                                                               \
    MEMACCESS(0)                                                               \
    "vld2.8     {d2, d3}, [%0]!                \n"                             \
    "vmov.u8    d0, d3                         \n" /* Y                      */\
    "vmov.u8    d3, d2                         \n"                             \
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
95ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Loads the constants consumed by YUVTORGB. Expects the asm statement to
// provide the named operands kUVToRB, kUVToG, kUVBiasBGR and kYToRgb.
//   d24 <- 8 bytes at kUVToRB (multiplied with U/V for the B and R terms)
//   d25 <- 8 bytes at kUVToG  (multiplied with U/V for the G term)
//   q13 <- first  16-bit value at kUVBiasBGR, in every lane (B bias)
//   q4  <- second 16-bit value at kUVBiasBGR, in every lane (G bias)
//   q14 <- third  16-bit value at kUVBiasBGR, in every lane (R bias)
//   q15 <- 32-bit value at kYToRgb, in every lane
// The first two bias loads use writeback and advance kUVBiasBGR by 2 bytes
// each. Callers pass that pointer as an input-only operand, which must hold
// its original value when the asm statement finishes, so the register is
// moved back by 4 once the last bias has been loaded.
#define YUVTORGB_SETUP                                                         \
    MEMACCESS([kUVToRB])                                                       \
    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
    MEMACCESS([kUVToG])                                                        \
    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
    MEMACCESS([kUVBiasBGR])                                                    \
    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                           \
    MEMACCESS([kUVBiasBGR])                                                    \
    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                           \
    MEMACCESS([kUVBiasBGR])                                                    \
    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                           \
    "sub        %[kUVBiasBGR], %[kUVBiasBGR], #4 \n" /* undo both writebacks */\
    MEMACCESS([kYToRgb])                                                       \
    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
// Converts 8 pixels from YUV to RGB.
// Inputs:  d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes, plus the
//          constants loaded by YUVTORGB_SETUP (d24, d25, q4, q13, q14, q15).
// Outputs: d20 = 8 B bytes, d21 = 8 G bytes, d22 = 8 R bytes.
// Scratch: q0, q1, q3, q8, q9, q10 are overwritten.
// Each of the 4 chroma products is duplicated into two adjacent 16-bit lanes
// (shift into the upper half of a 32-bit lane, then add into the lower half)
// so that one U/V pair is applied to two neighbouring pixels.
#define YUVTORGB                                                               \
    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
    "vmovl.u8   q0, d0                         \n" /* Y                      */\
    "vmovl.s16  q10, d1                        \n" /* widen Y to 32 bits     */\
    "vmovl.s16  q0, d0                         \n"                             \
    "vmul.s32   q10, q10, q15                  \n" /* Y * kYToRgb            */\
    "vmul.s32   q0, q0, q15                    \n"                             \
    "vqshrun.s32 d0, q0, #16                   \n" /* >> 16, back to 16 bits */\
    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
    "vadd.s16   d18, d19                       \n" /* u*UG + v*VG            */\
    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
    "vaddw.u16  q1, q1, d16                    \n"                             \
    "vaddw.u16  q10, q10, d17                  \n"                             \
    "vaddw.u16  q3, q3, d18                    \n"                             \
    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
    "vqshrun.s16 d21, q0, #6                   \n" /* G */
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I444ToARGBRow_NEON(const uint8* src_y,
138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
1417bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const struct YuvConstants* yuvconstants,
142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
1457bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vmov.u8    d23, #255                      \n"
146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV444
1487bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %3
157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
1587bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
1597bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
1607bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
1617bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGBRow_NEON(const uint8* src_y,
168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
1717bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const struct YuvConstants* yuvconstants,
172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1747bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1777bc9febe8749e98a3812a0dc4380ceae75c29450Johann    READYUV422
1787bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %3
187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
1887bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
1897bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
1907bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
1917bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1977bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid I422AlphaToARGBRow_NEON(const uint8* src_y,
1987bc9febe8749e98a3812a0dc4380ceae75c29450Johann                             const uint8* src_u,
1997bc9febe8749e98a3812a0dc4380ceae75c29450Johann                             const uint8* src_v,
2007bc9febe8749e98a3812a0dc4380ceae75c29450Johann                             const uint8* src_a,
2017bc9febe8749e98a3812a0dc4380ceae75c29450Johann                             uint8* dst_argb,
2027bc9febe8749e98a3812a0dc4380ceae75c29450Johann                             const struct YuvConstants* yuvconstants,
2037bc9febe8749e98a3812a0dc4380ceae75c29450Johann                             int width) {
204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
2087bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
2097bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "subs       %5, %5, #8                     \n"
210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
2117bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vld1.8     {d23}, [%3]!                   \n"
2127bc9febe8749e98a3812a0dc4380ceae75c29450Johann    MEMACCESS(4)
2137bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"
214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
2187bc9febe8749e98a3812a0dc4380ceae75c29450Johann      "+r"(src_a),     // %3
2197bc9febe8749e98a3812a0dc4380ceae75c29450Johann      "+r"(dst_argb),  // %4
2207bc9febe8749e98a3812a0dc4380ceae75c29450Johann      "+r"(width)      // %5
2217bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
2227bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
2237bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
2247bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2307bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid I411ToARGBRow_NEON(const uint8* src_y,
231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
2337bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        uint8* dst_argb,
2347bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const struct YuvConstants* yuvconstants,
235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
2387bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vmov.u8    d23, #255                      \n"
239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2407bc9febe8749e98a3812a0dc4380ceae75c29450Johann    READYUV411
2417bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
2497bc9febe8749e98a3812a0dc4380ceae75c29450Johann      "+r"(dst_argb),  // %3
250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
2517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
2527bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
2537bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
2547bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGBARow_NEON(const uint8* src_y,
261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_rgba,
2647bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const struct YuvConstants* yuvconstants,
265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2677bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
2707bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
2727bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vmov.u8    d19, #255                      \n"  // d19 modified by YUVTORGB
273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),     // %1
278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),     // %2
279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgba),  // %3
280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %4
2817bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
2827bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
2837bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
2847bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
285da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGB24Row_NEON(const uint8* src_y,
291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* src_u,
292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* src_v,
293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_rgb24,
2947bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         const struct YuvConstants* yuvconstants,
295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int width) {
296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2977bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
3007bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst3.8     {d20, d21, d22}, [%3]!         \n"
304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),      // %0
306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),      // %1
307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),      // %2
308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgb24),  // %3
309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)       // %4
3107bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
3117bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
3127bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
3137bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
314da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs 8 pixels held as bytes in d20 (B), d21 (G), d22 (R) into eight
// 16-bit RGB565 values in q0.
// vshll.u8 #8 widens each channel to 16 bits with the byte in the high half;
// the two vsri.16 inserts then keep the top 5 bits of R, place the top 6 bits
// of G below them and the top 5 bits of B at the bottom.
// Overwrites q0, q8 and q9.
#define ARGBTORGB565                                                           \
    "vshll.u8    q0, d22, #8                   \n"  /* R                    */ \
    "vshll.u8    q8, d21, #8                   \n"  /* G                    */ \
    "vshll.u8    q9, d20, #8                   \n"  /* B                    */ \
    "vsri.16     q0, q8, #5                    \n"  /* RG                   */ \
    "vsri.16     q0, q9, #11                   \n"  /* RGB                  */
325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGB565Row_NEON(const uint8* src_y,
327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* src_u,
328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* src_v,
329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_rgb565,
3307bc9febe8749e98a3812a0dc4380ceae75c29450Johann                          const struct YuvConstants* yuvconstants,
331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int width) {
332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
3337bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
3367bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTORGB565
339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),    // %0
343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),    // %1
344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),    // %2
345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgb565),  // %3
346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)     // %4
3477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
3487bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
3497bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
3507bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
351da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs 8 pixels held as bytes in d20 (B), d21 (G), d22 (R), d23 (A) into
// eight 16-bit ARGB1555 values in q0.
// Each channel is widened with its byte in the high half; the vsri.16 inserts
// then keep the top bit of A, followed by the top 5 bits of R, G and B.
// Overwrites q0, q8, q9 and q10. q10 aliases d20/d21, so G is widened into q9
// before B is widened into q10 -- the instruction order matters.
#define ARGBTOARGB1555                                                         \
    "vshll.u8    q0, d23, #8                   \n"  /* A                    */ \
    "vshll.u8    q8, d22, #8                   \n"  /* R                    */ \
    "vshll.u8    q9, d21, #8                   \n"  /* G                    */ \
    "vshll.u8    q10, d20, #8                  \n"  /* B                    */ \
    "vsri.16     q0, q8, #1                    \n"  /* AR                   */ \
    "vsri.16     q0, q9, #6                    \n"  /* ARG                  */ \
    "vsri.16     q0, q10, #11                  \n"  /* ARGB                 */
364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGB1555Row_NEON(const uint8* src_y,
366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            const uint8* src_u,
367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            const uint8* src_v,
368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            uint8* dst_argb1555,
3697bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            const struct YuvConstants* yuvconstants,
370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int width) {
371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
3727bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
3757bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTOARGB1555
379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),    // %0
383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),    // %1
384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),    // %2
385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb1555),  // %3
386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)     // %4
3877bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
3887bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
3897bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
3907bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
391da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs 8 pixels held as bytes in d20 (B), d21 (G), d22 (R), d23 (A) into
// eight 16-bit ARGB4444 values in q0 (d0/d1).
// B and R are shifted down to the low nibble, G and A keep only their high
// nibble (vbic with d4), the pairs are OR-ed into one byte each and vzip.u8
// interleaves the BG and RA bytes per pixel.
// Requires d4 to hold 0x0f in every byte; the user of the macro must set it
// up. Modifies d20-d23 in place in addition to writing q0.
#define ARGBTOARGB4444                                                         \
    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGB4444Row_NEON(const uint8* src_y,
406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            const uint8* src_u,
407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            const uint8* src_v,
408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            uint8* dst_argb4444,
4097bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            const struct YuvConstants* yuvconstants,
410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int width) {
411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
4127bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
4167bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"
418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTOARGB4444
420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),    // %0
424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),    // %1
425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),    // %2
426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb4444),  // %3
427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)     // %4
4287bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
4297bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
4307bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
4317bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
432da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
437da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid I400ToARGBRow_NEON(const uint8* src_y,
438da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                        uint8* dst_argb,
439da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                        int width) {
440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
4417bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
4427bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vmov.u8    d23, #255                      \n"
443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV400
4457bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"
447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %1
452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %2
4537bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
4547bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
4557bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
4567bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
457da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
462da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid J400ToARGBRow_NEON(const uint8* src_y,
463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d23, #255                      \n"
467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d20}, [%0]!                   \n"
470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov       d21, d20                       \n"
471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov       d22, d20                       \n"
472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"
473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %1
478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %2
479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    :
480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "cc", "memory", "d20", "d21", "d22", "d23"
481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid NV12ToARGBRow_NEON(const uint8* src_y,
485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_uv,
486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
4877bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const struct YuvConstants* yuvconstants,
488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
4907bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
4917bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vmov.u8    d23, #255                      \n"
492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV12
4947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"
496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_uv),    // %1
501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %2
502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %3
5037bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
5047bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
5057bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
5067bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
507da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid NV21ToARGBRow_NEON(const uint8* src_y,
5137bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const uint8* src_vu,
514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
5157bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const struct YuvConstants* yuvconstants,
516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
5187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
5197bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vmov.u8    d23, #255                      \n"
520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV21
5227bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"
524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
5287bc9febe8749e98a3812a0dc4380ceae75c29450Johann      "+r"(src_vu),    // %1
529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %2
530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %3
5317bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
5327bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
5337bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
5347bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
535da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid NV12ToRGB565Row_NEON(const uint8* src_y,
541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* src_uv,
542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_rgb565,
5437bc9febe8749e98a3812a0dc4380ceae75c29450Johann                          const struct YuvConstants* yuvconstants,
544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int width) {
545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
5467bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV12
5497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"
551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTORGB565
552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_y),     // %0
556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_uv),    // %1
557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_rgb565),  // %2
558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %3
5597bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
5607bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
5617bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
5627bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToARGBRow_NEON(const uint8* src_yuy2,
569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
5707bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const struct YuvConstants* yuvconstants,
571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
5737bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
5747bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vmov.u8    d23, #255                      \n"
575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUY2
5777bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"
579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_yuy2),  // %0
583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %1
584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %2
5857bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
5867bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
5877bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
5887bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
589da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row from src_uyvy to dst_argb, 8 pixels per loop iteration.
// The four tables of yuvconstants (kUVToRB, kUVToG, kUVBiasBGR, kYToRgb) are
// handed to the asm as named register operands.
// YUVTORGB_SETUP, READUYVY and YUVTORGB are macros defined elsewhere in this
// file (not visible here); presumably they load the constants, read the UYVY
// pixels and leave the colour channels in d20-d22 - TODO confirm.
// The loop body always runs once and repeats while the remaining width is > 0,
// so width is presumably expected to be a positive multiple of 8.
594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid UYVYToARGBRow_NEON(const uint8* src_uyvy,
595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
5967bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        const struct YuvConstants* yuvconstants,
597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
5997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB_SETUP
    // d23 is written once here and stored as the fourth interleaved channel of
    // every pixel (presumably alpha, fully opaque).
6007bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vmov.u8    d23, #255                      \n"
601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READUYVY
6037bc9febe8749e98a3812a0dc4380ceae75c29450Johann    YUVTORGB
    // The flags set by this subs are consumed by the bgt below; the store in
    // between does not alter them.
604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"
605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_uyvy),  // %0
609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_argb),  // %1
610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)      // %2
6117bc9febe8749e98a3812a0dc4380ceae75c29450Johann    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
6127bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVToG]"r"(&yuvconstants->kUVToG),
6137bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
6147bc9febe8749e98a3812a0dc4380ceae75c29450Johann      [kYToRgb]"r"(&yuvconstants->kYToRgb)
615da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
// vld2.8 de-interleaves while loading: q0 receives the even bytes and q1 the
// odd bytes. width is decremented by 16 per iteration and the loop repeats
// while it stays > 0; the body always runs at least once, so width is
// presumably a positive multiple of 16 - TODO confirm against callers.
621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     int width) {
623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 processed per loop
628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store U
630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q1}, [%2]!                    \n"  // store V
632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "+r"(src_uv),  // %0
634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_u),   // %1
635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_v),   // %2
636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)    // %3  // Output registers
637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    :                       // Input registers
638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1"  // Clobber List
639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 16 U's and V's and writes out 16 pairs of UV.
// The inverse of a UV split: two plain 16-byte loads fill q0 (from src_u) and
// q1 (from src_v), and vst2 interleaves them while storing 32 bytes to dst_uv.
// width is decremented by 16 per iteration; the body always runs at least
// once, so width is presumably a positive multiple of 16 - TODO confirm.
643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     int width) {
645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load U
649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%1]!                    \n"  // load V
651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 processed per loop
652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    :
656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_u),   // %0
657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(src_v),   // %1
658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(dst_uv),  // %2
659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian      "+r"(width)    // %3  // Output registers
660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    :                       // Input registers
661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    : "cc", "memory", "q0", "q1"  // Clobber List
662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Copy multiple of 32.  The code below uses vld1.8 with four d registers
// (the original note named vld4.8); that note also stated the form allows
// unaligned addresses and is fastest on a15.
// Copies from src to dst in 32-byte steps through d0-d3 (listed as q0/q1 in
// the clobbers). count is decremented by 32 per iteration and the loop repeats
// while it stays > 0; the body always runs at least once.
666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid CopyRow_NEON(const uint8* src, uint8* dst, int count) {
667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #32                    \n"  // 32 processed per loop
672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src),   // %0
676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst),   // %1
677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(count)  // %2  // Output registers
678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :                     // Input registers
679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
683da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian// SetRow writes 'count' bytes using an 8 bit value repeated.
// v8 is broadcast into all 16 byte lanes of q0 once, then q0 is stored 16 bytes
// at a time. count is decremented by 16 before each store and the loop repeats
// while it stays > 0, so the body always runs at least once and a count that
// is not a multiple of 16 is rounded up to the next 16 bytes written.
684da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid SetRow_NEON(uint8* dst, uint8 v8, int count) {
685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
686da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
687da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  "1:                                          \n"
688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8    {q0}, [%0]!                     \n"  // store
691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt       1b                              \n"
692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(dst),   // %0
693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(count)  // %1
694da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  : "r"(v8)      // %2
695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0"
696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
699da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian// ARGBSetRow writes 'count' pixels using a 32 bit value repeated.
// v32 is broadcast into the four 32-bit lanes of q0 once, then q0 (4 pixels,
// 16 bytes) is stored per iteration. count is decremented by 4 before each
// store and the loop repeats while it stays > 0, so the body always runs at
// least once and a count that is not a multiple of 4 is rounded up.
700da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
701da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  asm volatile (
702da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
703da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  "1:                                          \n"
704da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
705da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    MEMACCESS(0)
706da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vst1.8    {q0}, [%0]!                     \n"  // store
707da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "bgt       1b                              \n"
708da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  : "+r"(dst),   // %0
709da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "+r"(count)  // %1
710da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  : "r"(v32)     // %2
711da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  : "cc", "memory", "q0"
712da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  );
713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Writes the bytes of a row to dst in reverse order, 16 bytes per iteration.
// src is first advanced to its last 16-byte group (src + width - 16) and then
// walked backwards using r3 = -16 as the post-index step, while dst advances
// forwards. r3 is a hard-coded scratch register and is listed in the clobbers.
// The body always runs at least once; width is presumably a positive multiple
// of 16 - TODO confirm against callers.
715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Start at end of source row.
718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "mov        r3, #-16                       \n"
719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %0, %0, %2                     \n"
720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "sub        %0, #16                        \n"
721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, #16                        \n"  // 16 pixels per loop.
    // vrev64.8 reverses the bytes inside each 64-bit half only; storing d1
    // before d0 swaps the halves and completes the 16-byte reversal.
726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrev64.8   q0, q0                         \n"
727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"
731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src),   // %0
733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst),   // %1
734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)  // %2
735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "r3", "q0"
737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Reverses a row of interleaved UV pairs and splits it into two planes,
// 8 pairs (16 source bytes) per iteration.
// src_uv is first advanced to its last 16-byte group (src_uv + width * 2 - 16)
// and then walked backwards using r12 = -16 as the post-index step; dst_u and
// dst_v each advance by 8 bytes per iteration. r12 is a hard-coded scratch
// register and is listed in the clobbers.
// The body always runs at least once; width is presumably a positive multiple
// of 8 - TODO confirm against callers.
740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      int width) {
742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Start at end of source row.
744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "mov        r12, #-16                      \n"
745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %0, %0, %3, lsl #1             \n"
746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "sub        %0, #16                        \n"
747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
    // vld2.8 de-interleaves: d0 gets the even bytes, d1 the odd bytes.
750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, #8                         \n"  // 8 pixels per loop.
    // Reverses the 8 bytes inside d0 and inside d1 independently.
752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrev64.8   q0, q0                         \n"
753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"
757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_uv),  // %0
759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),   // %1
760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),   // %2
761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)    // %3
762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "r12", "q0"
764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Writes the 32-bit pixels of a row to dst in reverse order, 4 pixels
// (16 bytes) per iteration; the byte order inside each pixel is unchanged.
// src is first advanced to its last 4-pixel group (src + width * 4 - 16) and
// then walked backwards using r3 = -16 as the post-index step, while dst
// advances forwards. r3 is a hard-coded scratch register and is listed in the
// clobbers.
// The body always runs at least once; width is presumably a positive multiple
// of 4 - TODO confirm against callers.
767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Start at end of source row.
770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "mov        r3, #-16                       \n"
771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %0, %0, %2, lsl #2             \n"
772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "sub        %0, #16                        \n"
773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, #4                         \n"  // 4 pixels per loop.
    // vrev64.32 swaps the two 32-bit pixels inside each 64-bit half; storing
    // d1 before d0 swaps the halves and completes the 4-pixel reversal.
778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrev64.32  q0, q0                         \n"
779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"
783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src),   // %0
785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst),   // %1
786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)  // %2
787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "r3", "q0"
789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Expands a row of 3-byte RGB24 pixels to 4-byte ARGB, 8 pixels per iteration.
// vld3.8 de-interleaves 24 source bytes into d1, d2, d3 (one channel each);
// d4 is preset to 255 and stored as the fourth channel, so vst4.8 writes
// 32 bytes with the three source channels in their original order.
// The body always runs at least once; width is presumably a positive multiple
// of 8 - TODO confirm against callers.
7927bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #255                       \n"  // Alpha
795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb24),  // %0
803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %1
8047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)         // %2
805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Expands a row of 3-byte RAW pixels to 4-byte ARGB, 8 pixels per iteration.
// vld3.8 de-interleaves 24 source bytes into d1, d2, d3; d1 and d3 are then
// swapped (first and third channel exchanged) before vst4.8 stores them with
// d4, preset to 255, as the fourth channel.
// The body always runs at least once; width is presumably a positive multiple
// of 8 - TODO confirm against callers.
8107bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #255                       \n"  // Alpha
813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vswp.u8    d1, d3                         \n"  // swap R, B
818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
820ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_raw),   // %0
822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),  // %1
8237bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)      // %2
824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts a row of 3-byte RAW pixels to 3-byte RGB24 by exchanging the first
// and third channel of every pixel, 8 pixels (24 bytes) per iteration.
// vld3.8 de-interleaves into d1, d2, d3; d1 and d3 are swapped; vst3.8
// re-interleaves on store.
// The body always runs at least once; width is presumably a positive multiple
// of 8 - TODO confirm against callers.
8297bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
8307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  asm volatile (
8317bc9febe8749e98a3812a0dc4380ceae75c29450Johann  "1:                                          \n"
8327bc9febe8749e98a3812a0dc4380ceae75c29450Johann    MEMACCESS(0)
8337bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
8347bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
8357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vswp.u8    d1, d3                         \n"  // swap R, B
8367bc9febe8749e98a3812a0dc4380ceae75c29450Johann    MEMACCESS(1)
8377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
8387bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "bgt        1b                             \n"
8397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  : "+r"(src_raw),    // %0
8407bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(dst_rgb24),  // %1
8417bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)       // %2
8427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  :
8437bc9febe8749e98a3812a0dc4380ceae75c29450Johann  : "cc", "memory", "d1", "d2", "d3"  // Clobber List
8447bc9febe8749e98a3812a0dc4380ceae75c29450Johann  );
8457bc9febe8749e98a3812a0dc4380ceae75c29450Johann}
8467bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Asm fragment that expands 8 RGB565 pixels held in q0 (one 16-bit value per
// pixel) into three 8-bit channel vectors: d0 = B, d1 = G, d2 = R.
// Each 5- or 6-bit field is shifted to the top of its byte and its own upper
// bits are OR-ed into the vacated low bits, so the narrow field is stretched
// over the full 8-bit range.
// Uses d4, d5 (q2) and d6 as temporaries; d3 is not written, so the caller
// supplies the fourth channel.
847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#define RGB565TOARGB                                                           \
848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts a row of 16-bit RGB565 pixels to 32-bit ARGB, 8 pixels per
// iteration: 16 source bytes are loaded into q0, the RGB565TOARGB fragment
// leaves B, G, R in d0, d1, d2, and vst4.8 interleaves them with d3 (preset to
// 255 before the loop) into 32 destination bytes.
// The body always runs at least once; width is presumably a positive multiple
// of 8 - TODO confirm against callers.
8597bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // Alpha
862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb565),  // %0
871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %1
8727bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)          // %2
873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Asm fragment that expands 8 ARGB1555 pixels held in q0 (one 16-bit value per
// pixel) into four 8-bit channel vectors: d0 = B, d1 = G, d2 = R, d3 = A.
// The 1-bit alpha is isolated as 0 or 1 and negated, giving 0x00 or 0xFF.
// Each 5-bit colour field is shifted to the top of its byte and its own upper
// 3 bits are OR-ed into the vacated low bits.
// Uses q2 (d4, d5) and q3 (d6, d7) as temporaries. Because d3 is written
// here, any value the caller placed in d3 beforehand is overwritten.
// NOTE: the last instruction line ends with a continuation backslash, so the
// empty source line that follows is part of this macro and must stay empty.
878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#define ARGB1555TOARGB                                                         \
879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Input: 8 pixels in q0 (one 16-bit value per pixel).
// Output: d0 = B, d1 = G, d2 = R; d3 is not written, so the caller supplies
// the fourth channel. The top (alpha) bit of each pixel is shifted out when
// the R byte is moved left by 3.
// Uses d4, d5 (q2) and d6 as temporaries.
893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#define RGB555TOARGB                                                           \
894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
9067bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            int width) {
907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // Alpha
909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB1555TOARGB
914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb1555),  // %0
918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %1
9197bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)          // %2
920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// ARGB4444TOARGB widens 8 ARGB4444 pixels to 8 bits per channel.
// In:      q0 = 8 pixels, 16 bits each.
// Out:     d0 = B, d1 = G, d2 = R, d3 = A.
// Scratch: q2.
// Each 4 bit channel is duplicated into both nibbles of its output byte.
#define ARGB4444TOARGB                                                         \
    "vuzp.u8    d0, d1                         \n"  /* d0: G|B  d1: A|R     */ \
    "vshl.u8    q2, q0, #4                     \n"  /* BBBB0000 | RRRR0000  */ \
    "vshr.u8    q1, q0, #4                     \n"  /* 0000GGGG | 0000AAAA  */ \
    "vshr.u8    q0, q2, #4                     \n"  /* 0000BBBB | 0000RRRR  */ \
    "vorr.u8    q0, q0, q2                     \n"  /* q0: B, R full bytes  */ \
    "vshl.u8    q2, q1, #4                     \n"  /* GGGG0000 | AAAA0000  */ \
    "vorr.u8    q1, q1, q2                     \n"  /* q1: G, A full bytes  */ \
    "vswp.u8    d1, d2                         \n"  /* reorder to B,G,R,A   */
934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
9367bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            int width) {
937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // Alpha
939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb4444),  // %0
948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %1
9497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)          // %2
950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9557bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),   // %0
965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_rgb24),  // %1
9667bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)         // %2
967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9727bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vswp.u8    d1, d3                         \n"  // swap R, B
979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_raw),   // %1
9847bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9907bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_yuy2),  // %0
1000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
10017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
1002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
1004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
10077bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
1008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
1012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
1013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
1015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_uyvy),  // %0
1017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
10187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
1019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
1021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
10257bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         int width) {
1026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
1030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
1031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
1033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
1035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_yuy2),  // %0
1037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
10397bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %3
1040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
1042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
10467bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         int width) {
1047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
1051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
1052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
1054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
1056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_uyvy),  // %0
1058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
10607bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %3
1061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
1063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
10677bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      uint8* dst_u, uint8* dst_v, int width) {
1068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // stride + src_yuy2
1070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
1073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
1074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
1076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
1077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
1078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
1080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
1082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_yuy2),     // %0
1084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(stride_yuy2),  // %1
1085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),        // %2
1086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),        // %3
10877bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)           // %4
1088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
1090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
10947bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      uint8* dst_u, uint8* dst_v, int width) {
1095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // stride + src_uyvy
1097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
1100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
1101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
1103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
1104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
1105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
1107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
1109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_uyvy),     // %0
1111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(stride_uyvy),  // %1
1112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),        // %2
1113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),        // %3
11147bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)           // %4
1115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
1117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
11227bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         const uint8* shuffler, int width) {
1123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q2}, [%3]                     \n"  // shuffler
1126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
1129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #4                     \n"  // 4 processed per loop
1130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
1131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
1132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
1134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),  // %1
11377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
1138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(shuffler)    // %3
1139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
1140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToYUY2Row_NEON(const uint8* src_y,
1144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
1145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
1146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_yuy2, int width) {
1147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
1151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
1153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
1155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels
1156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
1158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_y),     // %0
1160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_u),     // %1
1161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_v),     // %2
1162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_yuy2),  // %3
1163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)      // %4
1164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3"
1166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToUYVYRow_NEON(const uint8* src_y,
1170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
1171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
1172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_uyvy, int width) {
1173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
1177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
1179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
1181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 pixels
1182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
1184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_y),     // %0
1186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_u),     // %1
1187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_v),     // %2
1188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_uyvy),  // %3
1189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)      // %4
1190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3"
1192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
11957bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTORGB565
1202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
1204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_rgb565),  // %1
12077bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
1208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
1214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                                const uint32 dither4, int width) {
1215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  asm volatile (
1216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vdup.32    d2, %2                         \n"  // dither4
1217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  "1:                                          \n"
1218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    MEMACCESS(1)
1219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
1220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
1221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vqadd.u8   d20, d20, d2                   \n"
1222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vqadd.u8   d21, d21, d2                   \n"
1223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vqadd.u8   d22, d22, d2                   \n"
1224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ARGBTORGB565
1225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    MEMACCESS(0)
1226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
1227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "bgt        1b                             \n"
1228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  : "+r"(dst_rgb)    // %0
1229da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  : "r"(src_argb),   // %1
1230da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "r"(dither4),    // %2
1231da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "r"(width)       // %3
1232da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
1233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  );
1234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
1235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
1236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
12377bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            int width) {
1238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTOARGB1555
1244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
1246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb1555),  // %1
12497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
1250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
12567bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            int width) {
1257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
1259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGBTOARGB4444
1264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
1266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),      // %0
1268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb4444),  // %1
12697bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)            // %2
1270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
12757bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
1278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
1279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
1280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #16                       \n"  // Add 16 constant
1281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
1286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
1287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
1288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
1289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d27                        \n"
1290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
1292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
12957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
1296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
1298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
13017bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
13027bc9febe8749e98a3812a0dc4380ceae75c29450Johann  asm volatile (
13037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  "1:                                          \n"
13047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    MEMACCESS(0)
13057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels
13067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels
13077bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "subs       %2, %2, #16                    \n"  // 16 processed per loop
13087bc9febe8749e98a3812a0dc4380ceae75c29450Johann    MEMACCESS(1)
13097bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.
13107bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "bgt       1b                              \n"
13117bc9febe8749e98a3812a0dc4380ceae75c29450Johann  : "+r"(src_argb),   // %0
13127bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(dst_a),      // %1
13137bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)       // %2
13147bc9febe8749e98a3812a0dc4380ceae75c29450Johann  :
13157bc9febe8749e98a3812a0dc4380ceae75c29450Johann  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
13167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  );
13177bc9febe8749e98a3812a0dc4380ceae75c29450Johann}
13187bc9febe8749e98a3812a0dc4380ceae75c29450Johann
13197bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
1322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
1323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
1324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
1329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
1330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
1331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
1332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
1334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
13377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
1338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
1340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 8x1 pixels.
1344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
13457bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         int width) {
1346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
1348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
1349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
1350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
1351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
1352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
1357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
1358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlsl.u8   q2, d1, d25                    \n"  // G
1359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlsl.u8   q2, d2, d26                    \n"  // R
1360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
1361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q3, d2, d24                    \n"  // R
1363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlsl.u8   q3, d1, d28                    \n"  // G
1364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlsl.u8   q3, d0, d27                    \n"  // B
1365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
1366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
1368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
1369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
13787bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %3
1379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
1381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
13847bc9febe8749e98a3812a0dc4380ceae75c29450Johann// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
1385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
13867bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         int width) {
1387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
1404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
1406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
1407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
1408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
1409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
1411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d1, d8, d9                     \n"  // B
1412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
1413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d3, d10, d11                   \n"  // G
1414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
1415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadd.u16  d5, d12, d13                   \n"  // R
1416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
1422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q0, q10                    \n"  // B
1423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q1, q11                    \n"  // G
1424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q2, q12                    \n"  // R
1425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
1426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q2, q10                    \n"  // R
1427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q1, q14                    \n"  // G
1428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q0, q13                    \n"  // B
1429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
1430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
1431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %1
1439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %2
14407bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %3
1441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
14477bc9febe8749e98a3812a0dc4380ceae75c29450Johann// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// RGBTOUV(QB, QG, QR): asm fragment that turns three q registers of 16-bit
// B, G and R values into 8 U bytes in d0 and 8 V bytes in d1.
// Reads its coefficients from q10-q14 and its bias from q15, which the
// enclosing asm block must have loaded; overwrites q8 and q9 as scratch.
#define RGBTOUV(QB, QG, QR)                                                   \
  "vmul.s16   q8, " #QB ", q10               \n" /* u  = B * q10           */ \
  "vmls.s16   q8, " #QG ", q11               \n" /* u -= G * q11           */ \
  "vmls.s16   q8, " #QR ", q12               \n" /* u -= R * q12           */ \
  "vadd.u16   q8, q8, q15                    \n" /* u += bias              */ \
  "vmul.s16   q9, " #QR ", q10               \n" /* v  = R * q10           */ \
  "vmls.s16   q9, " #QG ", q14               \n" /* v -= G * q14           */ \
  "vmls.s16   q9, " #QB ", q13               \n" /* v -= B * q13           */ \
  "vadd.u16   q9, q9, q15                    \n" /* v += bias              */ \
  "vqshrn.u16  d0, q8, #8                    \n" /* U: >> 8 and narrow     */ \
  "vqshrn.u16  d1, q9, #8                    \n" /* V: >> 8 and narrow     */
1459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
14627bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      uint8* dst_u, uint8* dst_v, int width) {
1463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
1483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
1484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q0, q1, q2)
1493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_argb),  // %1
1500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
15027bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Subsample match C code.
1510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
15117bc9febe8749e98a3812a0dc4380ceae75c29450Johann                       uint8* dst_u, uint8* dst_v, int width) {
1512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
1515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
1516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
1517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
1518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
1519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
1525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
1532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
1533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1534ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q0, q1, q2)
1542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
1548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_argb),  // %1
1549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
15517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
15597bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      uint8* dst_u, uint8* dst_v, int width) {
1560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
1562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
1571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
1573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
1574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
1575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
1576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
1578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
1580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
1581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
1582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
1583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
1585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q3, q3, #1                     \n"
1587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q3, q2, q1)
1590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_bgra),  // %0
1596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_bgra),  // %1
1597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
15997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
16077bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      uint8* dst_u, uint8* dst_v, int width) {
1608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
1610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
1619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
1621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
1622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
1624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
1626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
1628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
1629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q2, q1, q0)
1638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_abgr),  // %0
1644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_abgr),  // %1
1645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
16477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
16557bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      uint8* dst_u, uint8* dst_v, int width) {
1656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
1658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
1667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
1669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
1670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
1671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
1672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
1674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
1676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
1677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
1678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
1679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q0, q1, q2)
1686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgba),  // %0
1692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_rgba),  // %1
1693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
16957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
17037bc9febe8749e98a3812a0dc4380ceae75c29450Johann                       uint8* dst_u, uint8* dst_v, int width) {
1704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
1705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
1706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
1713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
1715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
1717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
1718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
1722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
1724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
1725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q0, q1, q2)
1734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
1738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
1739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb24),  // %0
1740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_rgb24),  // %1
1741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
17437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// 16x2 RAW pixels -> 8x1 U and 8x1 V.
// Each loop pass reads 16 pixels (48 bytes) from src_raw and 16 pixels from
// the row that starts src_stride_raw bytes after it, sums every 2x2 group per
// channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// width is counted in source pixels and is reduced by 16 per pass.  The count
// is tested only after a pass has run, so one full pass always executes and
// any width that is not a multiple of 16 is rounded up to the next pass.
// RGBTOUV and MEMACCESS are macros defined outside this function.
1750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
17517bc9febe8749e98a3812a0dc4380ceae75c29450Johann                     uint8* dst_u, uint8* dst_v, int width) {
1752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
    // After this add, operand %1 no longer holds the stride: it is the
    // address of the second source row.
1753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_raw
    // Coefficients are loaded at half their nominal value; the channel values
    // produced below are twice the 2x2 average.  Presumably q10-q15 are read
    // by RGBTOUV - confirm against the macro definition.
1754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
    // First row: two de-interleaving loads put byte 0 of each pixel in q0,
    // byte 1 in q1 and byte 2 in q2.
1761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
1763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
1765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
1766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
1767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
    // Second row: same layout in q4/q5/q6, then pairwise-accumulated onto the
    // first-row sums so each 16-bit lane holds the sum of a 2x2 group.
1768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
1770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
1772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
1773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
1774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Rounding shift right by 1 of a 4-pixel sum = twice the average.
1776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
1777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q1, q1, #1                     \n"
1778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q2, q2, #1                     \n"
1779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 wide x 2 rows = 32 processed per loop.
    // Arguments are passed as (q2, q1, q0), i.e. the registers labelled
    // B, G, R above.  The stores below take U from d0 and V from d1, so the
    // macro is expected to leave its results there.
1781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGBTOUV(q2, q1, q0)
1782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    // Branches on the flags set by the subs above: repeat while width > 0.
1786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
  // All five operands are read-write: the asm advances the pointers,
  // overwrites the stride with a row address and counts width down.
1787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_raw),  // %0
1788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_raw),  // %1
1789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
17917bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
17987bc9febe8749e98a3812a0dc4380ceae75c29450Johann// 16x2 pixels -> 8x1.  width is number of RGB565 pixels. e.g. 16.
// Each loop pass reads 16 pixels (32 bytes) from src_rgb565 and 16 pixels
// from the row that starts src_stride_rgb565 bytes after it, sums every 2x2
// group per channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// width is reduced by 16 per pass and tested only after a pass has run, so
// one full pass always executes.
// RGB565TOARGB and MEMACCESS are macros defined outside this function; the
// code after each RGB565TOARGB reads B from d0, G from d1 and R from d2.
1799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
18007bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        uint8* dst_u, uint8* dst_v, int width) {
1801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
    // After this add, operand %1 holds the address of the second source row.
1802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_rgb565
    // Coefficients are loaded at half their nominal value because the channel
    // values they multiply below are twice the 2x2 average.
1803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
    // First row, pixels 0-7: pairwise sums go to the low halves of
    // q4 (B), q5 (G) and q6 (R).
1810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
1812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
1813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
1814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
1815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
    // First row, pixels 8-15: pairwise sums go to the high halves.
1816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
1818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
1819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
1820ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
1821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
1822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Second row: pairwise sums are accumulated onto the first-row sums, so
    // each 16-bit lane ends up holding the sum of a 2x2 group.
1823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
1825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
1826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
1827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
1828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
1831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
1832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
1833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
1834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
1835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Rounding shift right by 1 of a 4-pixel sum = twice the average.
1836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
1837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q5, q5, #1                     \n"
1838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q6, q6, #1                     \n"
1839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    // U = (B * q10 - G * q11 - R * q12 + q15) >> 8
    // V = (R * q10 - G * q14 - B * q13 + q15) >> 8
    // computed in 16-bit lanes, then narrowed to bytes with unsigned
    // saturation.  None of these vector ops touch the flags set by subs.
1841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q4, q10                    \n"  // B
1842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q5, q11                    \n"  // G
1843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q6, q12                    \n"  // R
1844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
1845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q6, q10                    \n"  // R
1846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q5, q14                    \n"  // G
1847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q4, q13                    \n"  // B
1848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
1849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
1850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    // Branches on the flags set by the subs above: repeat while width > 0.
1855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
  // All five operands are read-write: the asm advances the pointers,
  // overwrites the stride with a row address and counts width down.
1856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb565),  // %0
1857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_rgb565),  // %1
1858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
18607bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
18677bc9febe8749e98a3812a0dc4380ceae75c29450Johann// 16x2 pixels -> 8x1.  width is number of ARGB1555 pixels. e.g. 16.
// Each loop pass reads 16 pixels (32 bytes) from src_argb1555 and 16 pixels
// from the row that starts src_stride_argb1555 bytes after it, sums every 2x2
// group per channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// width is reduced by 16 per pass and tested only after a pass has run, so
// one full pass always executes.
// RGB555TOARGB and MEMACCESS are macros defined outside this function; the
// code after each RGB555TOARGB reads B from d0, G from d1 and R from d2.
1868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
18697bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        uint8* dst_u, uint8* dst_v, int width) {
1870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
    // After this add, operand %1 holds the address of the second source row.
1871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb1555
    // Coefficients are loaded at half their nominal value because the channel
    // values they multiply below are twice the 2x2 average.
1872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
    // First row, pixels 0-7: pairwise sums go to the low halves of
    // q4 (B), q5 (G) and q6 (R).
1879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
1881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB555TOARGB
1882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
1883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
1884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
    // First row, pixels 8-15: pairwise sums go to the high halves.
1885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
1887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB555TOARGB
1888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
1889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
1890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
1891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Second row: pairwise sums are accumulated onto the first-row sums, so
    // each 16-bit lane ends up holding the sum of a 2x2 group.
1892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
1894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB555TOARGB
1895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
1896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
1897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
1900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB555TOARGB
1901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
1902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
1903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
1904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Rounding shift right by 1 of a 4-pixel sum = twice the average.
1905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
1906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q5, q5, #1                     \n"
1907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q6, q6, #1                     \n"
1908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    // U = (B * q10 - G * q11 - R * q12 + q15) >> 8
    // V = (R * q10 - G * q14 - B * q13 + q15) >> 8
    // computed in 16-bit lanes, then narrowed to bytes with unsigned
    // saturation.  None of these vector ops touch the flags set by subs.
1910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q4, q10                    \n"  // B
1911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q5, q11                    \n"  // G
1912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q6, q12                    \n"  // R
1913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
1914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q6, q10                    \n"  // R
1915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q5, q14                    \n"  // G
1916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q4, q13                    \n"  // B
1917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
1918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
1919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    // Branches on the flags set by the subs above: repeat while width > 0.
1924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
  // All five operands are read-write: the asm advances the pointers,
  // overwrites the stride with a row address and counts width down.
1925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb1555),  // %0
1926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_argb1555),  // %1
1927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
19297bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
1931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
1933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
1934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
19367bc9febe8749e98a3812a0dc4380ceae75c29450Johann// 16x2 pixels -> 8x1.  width is number of ARGB4444 pixels. e.g. 16.
// Each loop pass reads 16 pixels (32 bytes) from src_argb4444 and 16 pixels
// from the row that starts src_stride_argb4444 bytes after it, sums every 2x2
// group per channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// width is reduced by 16 per pass and tested only after a pass has run, so
// one full pass always executes.
// ARGB4444TOARGB and MEMACCESS are macros defined outside this function; the
// code after each ARGB4444TOARGB reads B from d0, G from d1 and R from d2.
1937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
19387bc9febe8749e98a3812a0dc4380ceae75c29450Johann                          uint8* dst_u, uint8* dst_v, int width) {
1939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
    // After this add, operand %1 holds the address of the second source row.
1940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %1, %0, %1                     \n"  // src_stride + src_argb4444
    // Coefficients are loaded at half their nominal value because the channel
    // values they multiply below are twice the 2x2 average.
1941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
1942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
1943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
1944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
1945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
    // First row, pixels 0-7: pairwise sums go to the low halves of
    // q4 (B), q5 (G) and q6 (R).
1948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
1950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
1951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
1952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
1953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
    // First row, pixels 8-15: pairwise sums go to the high halves.
1954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
1955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
1956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
1957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
1958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
1959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
1960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Second row: pairwise sums are accumulated onto the first-row sums, so
    // each 16-bit lane ends up holding the sum of a 2x2 group.
1961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
1963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
1964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
1965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
1966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
1968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
1969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
1970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
1971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
1972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
1973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Rounding shift right by 1 of a 4-pixel sum = twice the average.
1974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
1975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q5, q5, #1                     \n"
1976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshr.u16  q6, q6, #1                     \n"
1977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    // U = (B * q10 - G * q11 - R * q12 + q15) >> 8
    // V = (R * q10 - G * q14 - B * q13 + q15) >> 8
    // computed in 16-bit lanes, then narrowed to bytes with unsigned
    // saturation.  None of these vector ops touch the flags set by subs.
1979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q8, q4, q10                    \n"  // B
1980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q5, q11                    \n"  // G
1981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q8, q6, q12                    \n"  // R
1982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
1983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q9, q6, q10                    \n"  // R
1984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q5, q14                    \n"  // G
1985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmls.s16   q9, q4, q13                    \n"  // B
1986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
1987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
1988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
1990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
1992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    // Branches on the flags set by the subs above: repeat while width > 0.
1993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
  // All five operands are read-write: the asm advances the pointers,
  // overwrites the stride with a row address and counts width down.
1994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb4444),  // %0
1995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride_argb4444),  // %1
1996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_u),     // %2
1997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_v),     // %3
19987bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %4
1999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
2002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row of RGB565 pixels to Y bytes, 8 pixels per loop pass.
// Each pass reads 16 bytes from src_rgb565 and stores 8 bytes to dst_y.
// Per pixel: Y = sat_u8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16, where
// the final add also saturates.
// width is counted in pixels, reduced by 8 per pass and tested only after a
// pass has run, so one full pass always executes.
// RGB565TOARGB and MEMACCESS are macros defined outside this function; the
// code after RGB565TOARGB reads B from d0, G from d1 and R from d2.
20057bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
2006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
    // d24-d27 are the halves of q12 and q13, which is why those two appear in
    // the clobber list.
2007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
    // The flags set here are consumed by the bgt at the end of the loop.
2014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    RGB565TOARGB
    // Widening multiply-accumulate: 8-bit channels times 8-bit coefficients
    // summed into 16-bit lanes of q2.
2016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
2017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
2018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
2019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d27                        \n"  // saturating + 16
2021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
  // All three operands are read-write: the asm advances both pointers and
  // counts width down.
2024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb565),  // %0
2025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),       // %1
20267bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)          // %2
2027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
20327bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
2033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
2041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB1555TOARGB
2043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
2044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
2045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
2046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d27                        \n"
2048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb1555),  // %0
2052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),         // %1
20537bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)            // %2
2054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
20597bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
2060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
2062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
2063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
2064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
2068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ARGB4444TOARGB
2070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
2071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
2072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
2073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
2074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d27                        \n"
2075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb4444),  // %0
2079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),         // %1
20807bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)            // %2
2081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
20867bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
2087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
2095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d1, d4                     \n"  // R
2097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d5                     \n"  // G
2098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d3, d6                     \n"  // B
2099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_bgra),  // %0
2105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),     // %1
21067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
2107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
21127bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
2113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
2121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d0, d4                     \n"  // R
2123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d1, d5                     \n"  // G
2124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d6                     \n"  // B
2125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_abgr),  // %0
2131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),  // %1
21327bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
2133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
21387bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
2139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
2141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
2143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
2147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d1, d4                     \n"  // B
2149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d5                     \n"  // G
2150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d3, d6                     \n"  // R
2151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgba),  // %0
2157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),  // %1
21587bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
2159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
21647bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
2165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
2167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
2169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
2173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d0, d4                     \n"  // B
2175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d1, d5                     \n"  // G
2176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d6                     \n"  // R
2177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_rgb24),  // %0
2183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),  // %1
21847bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
2185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
21907bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
2191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
2193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
2194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
2195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
2199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d0, d4                     \n"  // B
2201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d1, d5                     \n"  // G
2202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d6                     \n"  // R
2203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
2204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d7                         \n"
2205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
2207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_raw),  // %0
2209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),  // %1
22107bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(width)        // %2
2211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear filter 16x2 -> 16x1
2217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid InterpolateRow_NEON(uint8* dst_ptr,
2218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* src_ptr, ptrdiff_t src_stride,
2219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int dst_width, int source_y_fraction) {
22207bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int y1_fraction = source_y_fraction;
2221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "cmp        %4, #0                         \n"
2223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "beq        100f                           \n"
2224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "add        %2, %1                         \n"
2225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "cmp        %4, #128                       \n"
2226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "beq        50f                            \n"
2227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.8     d5, %4                         \n"
2229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "rsb        %4, #256                       \n"
2230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.8     d4, %4                         \n"
2231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // General purpose row blend.
2232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"
2235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%2]!                    \n"
2237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"
2238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q13, d0, d4                    \n"
2239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q14, d1, d4                    \n"
2240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q13, d2, d5                    \n"
2241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q14, d3, d5                    \n"
2242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshrn.u16 d0, q13, #8                    \n"
2243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshrn.u16 d1, q14, #8                    \n"
2244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"
2246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "b          99f                            \n"
2248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 50 / 50.
2250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "50:                                         \n"
2251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"
2253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%2]!                    \n"
2255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"
2256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrhadd.u8  q0, q1                         \n"
2257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"
2259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        50b                            \n"
2260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "b          99f                            \n"
2261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 100 / 0 - Copy row unchanged.
2263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "100:                                        \n"
2264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%1]!                    \n"
2266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"
2267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%0]!                    \n"
2269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        100b                           \n"
2270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "99:                                         \n"
2272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(dst_ptr),          // %0
2273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_ptr),          // %1
2274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_stride),       // %2
2275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_width),        // %3
22767bc9febe8749e98a3812a0dc4380ceae75c29450Johann    "+r"(y1_fraction)       // %4
2277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
2279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
// Blends a row of src_argb0 over src_argb1 and writes the result to dst_argb.
// In the comments below s* / a are the colour / alpha bytes loaded from
// src_argb0 and d* are the colour bytes loaded from src_argb1. Every step
// saturates, and the alpha byte that is stored is always 255.
// Whole groups of 8 pixels are handled first and the remainder one pixel at a
// time, so width does not have to be a multiple of 8; width <= 0 stores
// nothing.
2283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint8* dst_argb, int width) {
2285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, #8                         \n"  // width -= 8 and set flags.
2287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "blt        89f                            \n"  // fewer than 8 pixels: skip the wide loop.
2288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 8 pixels per pass.
2289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "8:                                          \n"
2290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0 (d3 = a).
2292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1 (d7 is not used).
2294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop; flags feed the bge below.
2295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q10, d4, d3                    \n"  // db * a (16 bit)
2296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q11, d5, d3                    \n"  // dg * a (16 bit)
2297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q12, d6, d3                    \n"  // dr * a (16 bit)
2298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8 (rounded, narrowed)
2299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8 (rounded, narrowed)
2300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8 (rounded, narrowed)
2301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
2302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
2303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
2304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d2, d2, d6                     \n"  // + sr
2305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // a = 255
2306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
2308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bge        8b                             \n"  // repeat while another full group of 8 remains.
2309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "89:                                         \n"
2311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "adds       %3, #8-1                       \n"  // undo the -8 bias: %3 = leftover pixels - 1.
2312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "blt        99f                            \n"  // no leftover pixels: done.
2313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 1 pixel per pass. Only lane 0 is loaded and stored; the other lanes
    // are computed on leftover register contents and their results are discarded.
2315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
2318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
2320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #1                     \n"  // 1 processed per loop; flags feed the bge below.
2321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q10, d4, d3                    \n"  // db * a (16 bit)
2322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q11, d5, d3                    \n"  // dg * a (16 bit)
2323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q12, d6, d3                    \n"  // dr * a (16 bit)
2324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8 (rounded, narrowed)
2325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8 (rounded, narrowed)
2326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8 (rounded, narrowed)
2327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
2328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
2329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
2330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d2, d2, d6                     \n"  // + sr
2331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // a = 255
2332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
2334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bge        1b                             \n"  // repeat while leftover pixels remain.
2335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "99:                                         \n"
2337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb0),    // %0
2339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_argb1),    // %1
2340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),     // %2
2341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)         // %3
2342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
2344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Attenuate 8 pixels at a time.
// For every pixel read from src_argb the b, g and r bytes are multiplied by
// the pixel's own alpha byte and reduced back to 8 bits with a rounding,
// saturating shift right by 8; the alpha byte is stored to dst_argb unchanged.
// width is tested only after a full pass, so at least 8 pixels are always read
// and written and the row is processed in whole groups of 8.
// NOTE(review): presumably callers pass a positive multiple of 8 - confirm.
2348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Attenuate 8 pixels per pass.
2351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB (d3 = a).
2354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop; flags feed the bgt below.
2355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q10, d0, d3                    \n"  // b * a
2356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q11, d1, d3                    \n"  // g * a
2357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q12, d2, d3                    \n"  // r * a
2358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8 (rounded, narrowed)
2359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8 (rounded, narrowed)
2360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8 (rounded, narrowed)
2361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
2363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0.
2364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),   // %0
2365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %1
2366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %2
2367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
2369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Quantize 8 ARGB pixels (32 bytes).
2373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// dst = (dst * scale >> 16) * interval_size + interval_offset;
// Works in place on dst_argb: the b, g and r bytes are rewritten and the alpha
// byte is stored back as loaded. Only the low 16 bits of scale, interval_size
// and interval_offset reach the vector registers (vdup.u16).
// scale is halved once before the loop because vqdmulh doubles the product
// before keeping its high 16 bits; the halving also clears bit 15 so the
// factor is non-negative for the signed multiply.
// width is tested only after a full pass, so at least 8 pixels are always
// rewritten and the row is processed in whole groups of 8.
// NOTE(review): presumably callers pass a positive multiple of 8 - confirm.
2374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int interval_offset, int width) {
2376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.u16   q8, %2                         \n"  // scale in every 16 bit lane.
2378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
2379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.u16   q9, %3                         \n"  // interval multiply.
2380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.u16   q10, %4                        \n"  // interval add
2381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB; no post-increment, the store advances %0.
2386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %1, %1, #8                     \n"  // 8 processed per loop; flags feed the bgt below.
2387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255) widened to 16 bit
2388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q1, d2                         \n"  // g widened to 16 bit
2389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q2, d4                         \n"  // r widened to 16 bit
2390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale, high 16 bits
2391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqdmulh.s16 q1, q1, q8                    \n"  // g
2392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqdmulh.s16 q2, q2, q8                    \n"  // r
2393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
2394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.u16   q1, q1, q9                     \n"  // g
2395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.u16   q2, q2, q9                     \n"  // r
2396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
2397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q1, q1, q10                    \n"  // g
2398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.u16   q2, q2, q10                    \n"  // r
2399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d0, q0                         \n"  // b narrowed to 8 bit with saturation
2400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d2, q1                         \n"  // g
2401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d4, q2                         \n"  // r
2402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB (d6 = alpha as loaded).
2404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0.
2405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(dst_argb),       // %0
2406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)           // %1
2407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(scale),           // %2
2408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "r"(interval_size),   // %3
2409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "r"(interval_offset)  // %4
2410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
2411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shade 8 pixels at a time by specified value.
2415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
2416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
// The four bytes of value are per-channel factors: the lowest byte scales the
// first byte of each pixel (labelled b below), the next one g, then r, then a.
// Before the loop each factor byte is duplicated into both halves of a 16 bit
// lane (vzip.u8) and halved, because vqrdmulh doubles the product before
// keeping its rounded high 16 bits.
// width is tested only after a full pass, so at least 8 pixels are always read
// and written and the row is processed in whole groups of 8.
// NOTE(review): presumably callers pass a positive multiple of 8 - confirm.
2417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint32 value) {
2419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
2421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
2422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
2423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
2428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop; flags feed the bgt below.
2429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255) widened to 16 bit
2430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q11, d22                       \n"  // g
2431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q12, d24                       \n"  // r
2432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q13, d26                       \n"  // a
2433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
2434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
2435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
2436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
2437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d20, q10                       \n"  // b narrowed to 8 bit with saturation
2438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d22, q11                       \n"  // g
2439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d24, q12                       \n"  // r
2440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d26, q13                       \n"  // a
2441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
2443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0.
2444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),       // %0
2445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),       // %1
2446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)           // %2
2447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(value)            // %3
2448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
2449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Similar to ARGBToYJ but stores ARGB.
2454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
// The gray value is written to the b, g and r bytes of every output pixel; the
// alpha byte is copied from the source pixel.
// width is tested only after a full pass, so at least 8 pixels are always read
// and written and the row is processed in whole groups of 8.
// NOTE(review): presumably callers pass a positive multiple of 8 - confirm.
2455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
2458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
2459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
2460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop; flags feed the bgt below.
2464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d24                    \n"  // B
2465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d25                    \n"  // G
2466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d26                    \n"  // R
2467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B (rounding shift supplies the + 64)
2468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov       d1, d0                         \n"  // G
2469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov       d2, d0                         \n"  // R
2470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels (d3 = alpha as loaded).
2472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0.
2473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),  // %0
2474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),  // %1
2475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)      // %2
2476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
2478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    b = (r * 35 + g * 68 + b * 17) >> 7
2483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    g = (r * 45 + g * 88 + b * 22) >> 7
2484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    r = (r * 50 + g * 98 + b * 24) >> 7
// Works in place on dst_argb. The three sums are narrowed to 8 bits with
// saturation (no rounding); the alpha byte is stored back as loaded.
// width is tested only after a full pass, so at least 8 pixels are always
// rewritten and the row is processed in whole groups of 8.
// NOTE(review): presumably callers pass a positive multiple of 8 - confirm.
2485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d20, #17                       \n"  // BB coefficient
2488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d21, #68                       \n"  // BG coefficient
2489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d22, #35                       \n"  // BR coefficient
2490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d24, #22                       \n"  // GB coefficient
2491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d25, #88                       \n"  // GG coefficient
2492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d26, #45                       \n"  // GR coefficient
2493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d28, #24                       \n"  // RB coefficient
2494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d29, #98                       \n"  // RG coefficient
2495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d30, #50                       \n"  // RR coefficient
2496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels; no post-increment, the store advances %0.
2499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %1, %1, #8                     \n"  // 8 processed per loop; flags feed the bgt below.
2500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
2501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d1, d21                    \n"  // G
2502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q2, d2, d22                    \n"  // R
2503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
2504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q3, d1, d25                    \n"  // G
2505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q3, d2, d26                    \n"  // R
2506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
2507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d1, d29                    \n"  // G
2508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmlal.u8   q8, d2, d30                    \n"  // R
2509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
2510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
2511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
2512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels (d3 = alpha as loaded).
2514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"  // repeat while width > 0.
2515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(dst_argb),  // %0
2516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)      // %1
2517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3", "q8",  // q8 is the Sepia R accumulator.
2519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q10", "q11", "q12", "q13", "q14", "q15"
2520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Transform 8 ARGB pixels (32 bytes) with color matrix.
2524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
2525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// needs to saturate.  Consider doing a non-saturating version.
2526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                             const int8* matrix_argb, int width) {
2528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
2530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
2531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
2532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
2533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2534ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
2537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
2539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q9, d18                        \n"  // g
2540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmovl.u8   q10, d20                       \n"  // r
2541da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vmovl.u8   q11, d22                       \n"  // a
2542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
2543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
2544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
2545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
2546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
2547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
2548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
2549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
2550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
2551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
2552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
2555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
2556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
2557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
2558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
2559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
2560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2562da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
2563da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
2564da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
2565da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
2566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
2567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
2568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
2571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
2572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
2573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
2574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
2576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb),   // %0
2578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %1
2579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %2
2580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(matrix_argb)  // %3
2581da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
2582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "q10", "q11", "q12", "q13", "q14", "q15"
2583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
//
// For every byte channel the result is (src0 * src1 + 128) >> 8: a widening
// 8x8 -> 16 bit multiply (vmull.u8) followed by a rounding narrowing shift
// (vrshrn #8).  As a consequence 255 * 255 yields 254, not 255.
//
//   src_argb0, src_argb1 - input rows; each pointer advances 32 bytes per pass.
//   dst_argb             - output row; advances 32 bytes per pass.
//   width                - pixel counter, decremented by 8 per pass.
//
// The counter is tested only after a pass has run (subs ... bgt), so at least
// 8 pixels are always read and written and a width that is not a multiple of
// 8 is effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 8 - confirm.
//
// MEMACCESS(n) is a macro defined outside this chunk; here each use names the
// operand number of the pointer used by the instruction that follows it.
2587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_argb, int width) {
2589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
    // The two loads de-interleave into alternating d registers so that each
    // channel of row 0 (even d) sits next to the same channel of row 1 (odd d).
2592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
2594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
2596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q0, d0, d1                     \n"  // multiply B
2598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q1, d2, d3                     \n"  // multiply G
2599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q2, d4, d5                     \n"  // multiply R
2600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmull.u8   q3, d6, d7                     \n"  // multiply A
    // The order of the four narrowing shifts matters: q0 aliases d0:d1 and q1
    // aliases d2:d3, so each write below lands in a register half whose
    // 16 bit product is no longer needed.
2601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
2602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
2603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
2604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
2605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb0),  // %0
2610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_argb1),  // %1
2611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %2
2612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %3
2613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"
2615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Add 2 rows of ARGB pixels together, 8 pixels at a time.
//
// Every byte channel is added with unsigned saturation (vqadd.u8), so sums
// above 255 clamp to 255 instead of wrapping.
//
//   src_argb0, src_argb1 - input rows; each pointer advances 32 bytes per pass.
//   dst_argb             - output row; advances 32 bytes per pass.
//   width                - pixel counter, decremented by 8 per pass.
//
// The counter is tested only after a pass has run (subs ... bgt), so at least
// 8 pixels are always read and written and a width that is not a multiple of
// 8 is effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 8 - confirm.
2619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
2621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
2628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    // q0 = d0:d1 and q2 = d4:d5 hold the B and G planes of the two rows;
    // q1 = d2:d3 and q3 = d6:d7 hold R and A.  Two 128 bit adds cover all four.
2629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
2630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
2631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb0),  // %0
2636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_argb1),  // %1
2637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %2
2638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %3
2639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"
2641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
//
// Computes src_argb0 - src_argb1 for every byte channel with unsigned
// saturation (vqsub.u8), so results below 0 clamp to 0 instead of wrapping.
//
//   src_argb0 - minuend row; pointer advances 32 bytes per pass.
//   src_argb1 - subtrahend row; pointer advances 32 bytes per pass.
//   dst_argb  - output row; advances 32 bytes per pass.
//   width     - pixel counter, decremented by 8 per pass.
//
// The counter is tested only after a pass has run (subs ... bgt), so at least
// 8 pixels are always read and written and a width that is not a multiple of
// 8 is effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 8 - confirm.
2645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_argb, int width) {
2647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
2654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    // q0/q1 hold row 0 (B,G / R,A planes) and q2/q3 hold row 1, so the
    // operand order below is row 0 minus row 1.
2655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
2656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
2657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_argb0),  // %0
2662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_argb1),  // %1
2663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),   // %2
2664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)       // %3
2665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1", "q2", "q3"
2667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// A = 255
2672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// R = Sobel
2673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G = Sobel
2674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// B = Sobel
//
// "Sobel" here is src_sobelx[i] + src_sobely[i] with unsigned saturation
// (clamped to 255).  The same byte is written to the first three bytes of each
// output pixel, followed by 255 as the fourth byte.
//
//   src_sobelx, src_sobely - 1 byte per pixel inputs; advance 8 bytes per pass.
//   dst_argb               - 4 byte per pixel output; advances 32 bytes per pass.
//   width                  - pixel counter, decremented by 8 per pass.
//
// The counter is tested only after a pass has run (subs ... bgt), so at least
// 8 pixels are always processed and a width that is not a multiple of 8 is
// effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 8 - confirm.
2675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
2677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
    // d3 is loop invariant: it is set once here and never written in the loop.
2678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // alpha
2679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
2683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
2685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d0, d0, d1                     \n"  // add
    // Replicate the sum so the interleaving store emits it for B, G and R.
2687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d1, d0                         \n"
2688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d2, d0                         \n"
2689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_sobelx),  // %0
2693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_sobely),  // %1
2694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %2
2695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %3
2696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"
2698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Adds Sobel X and Sobel Y and stores Sobel into plane.
//
// dst_y[i] = src_sobelx[i] + src_sobely[i] with unsigned saturation (clamped
// to 255), one byte per pixel on both input and output.
//
//   src_sobelx, src_sobely - input rows; each pointer advances 16 bytes per pass.
//   dst_y                  - output row; advances 16 bytes per pass.
//   width                  - pixel counter, decremented by 16 per pass.
//
// Unlike the ARGB Sobel writers this loop handles 16 pixels per pass.  The
// counter is tested only after a pass has run (subs ... bgt), so at least 16
// bytes are always read and written and a width that is not a multiple of 16
// is effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 16 - confirm.
2702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_y, int width) {
2704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 16 pixel loop.
2706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
2707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
2709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
2711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
2712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   q0, q0, q1                     \n"  // add
2713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
2715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_sobelx),  // %0
2717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_sobely),  // %1
2718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_y),       // %2
2719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %3
2720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"
2722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Mixes Sobel X, Sobel Y and Sobel into ARGB.
2726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// A = 255
2727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// R = Sobel X
2728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G = Sobel
2729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// B = Sobel Y
//
// "Sobel" is src_sobelx[i] + src_sobely[i] with unsigned saturation (clamped
// to 255).  Each output pixel is written as the four bytes
// { sobely, sobel, sobelx, 255 } in increasing address order.
//
//   src_sobelx, src_sobely - 1 byte per pixel inputs; advance 8 bytes per pass.
//   dst_argb               - 4 byte per pixel output; advances 32 bytes per pass.
//   width                  - pixel counter, decremented by 8 per pass.
//
// The counter is tested only after a pass has run (subs ... bgt), so at least
// 8 pixels are always processed and a width that is not a multiple of 8 is
// effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 8 - confirm.
2730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
2732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
    // d3 is loop invariant: it is set once here and never written in the loop.
2733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vmov.u8    d3, #255                       \n"  // alpha
2734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 8 pixel loop.
2735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
    // The inputs are loaded straight into the registers that the store will
    // use for their channels (sobelx -> d2, sobely -> d0), so no moves are
    // needed; only the sum in d1 is computed.
2736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
2738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
2740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqadd.u8   d1, d0, d2                     \n"  // add
2742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
2744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_sobelx),  // %0
2746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_sobely),  // %1
2747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_argb),    // %2
2748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %3
2749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  :
2750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"
2751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// SobelX as a matrix is
2755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1  0  1
2756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -2  0  2
2757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1  0  1
//
// For each output pixel i the code computes, in signed 16 bit,
//   s = (y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])
// and stores |s| saturated to 255.  s is the negation of the matrix above;
// the absolute value makes the sign irrelevant.
//
//   src_y0, src_y1, src_y2 - three 1 byte per pixel input rows (top, center
//                            and bottom per the inline comments); each pointer
//                            advances 8 bytes per pass.
//   dst_sobelx             - output row; advances 8 bytes per pass.
//   width                  - pixel counter, decremented by 8 per pass.
//
// Because the second load of every row starts 2 bytes after the first, each
// pass reads 10 bytes per row, i.e. 2 bytes beyond the 8 pixels it produces.
// The counter is tested only after a pass has run (subs ... bgt), so at least
// one pass always executes and a width that is not a multiple of 8 is
// effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 8 and rows with
// at least 2 readable bytes past the end - confirm.
2758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
2759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                    const uint8* src_y2, uint8* dst_sobelx, int width) {
2760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
    // Each row is read twice: at p, then at p + 2 (post-increment by %5 = 2),
    // after which the pointer is moved on by %6 = 6 for a net step of 8.
2762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%0],%5                  \n"  // top
2764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d1}, [%0],%6                  \n"
2766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q0, d0, d1                     \n"
2767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
2769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%1],%6                  \n"
2771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q1, d2, d3                     \n"
    // The center difference is added twice to apply its weight of 2.
2772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
2773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
2774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
2776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%2],%6                  \n"
2778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %4, %4, #8                     \n"  // 8 pixels
2779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q1, d2, d3                     \n"
2780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
    // |s| is at most 4 * 255 = 1020, so it fits in 16 bits; the saturating
    // narrow then clamps anything above 255.
2781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vabs.s16   q0, q0                         \n"
2782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d0, q0                         \n"
2783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(3)
2784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
2785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_y0),      // %0
2787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_y1),      // %1
2788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_y2),      // %2
2789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_sobelx),  // %3
2790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %4
2791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(2),            // %5
2792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "r"(6)             // %6
2793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
2794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// SobelY as a matrix is
2798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1 -2 -1
2799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//  0  0  0
2800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//  1  2  1
//
// For each output pixel i the code computes, in signed 16 bit,
//   s = (y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])
// and stores |s| saturated to 255.  Only two input rows are needed because
// the middle row of the matrix is all zeros; the absolute value makes the
// sign of the row difference irrelevant.
//
//   src_y0, src_y1 - the two 1 byte per pixel input rows being differenced
//                    (presumably the rows above and below the output row -
//                    confirm against caller); each advances 8 bytes per pass.
//   dst_sobely     - output row; advances 8 bytes per pass.
//   width          - pixel counter, decremented by 8 per pass.
//
// The three loads per row start at offsets 0, 1 and 2, so each pass reads 10
// bytes per row, i.e. 2 bytes beyond the 8 pixels it produces.  The counter is
// tested only after a pass has run (subs ... bgt), so at least one pass always
// executes and a width that is not a multiple of 8 is effectively rounded up.
// NOTE(review): callers presumably pass a positive multiple of 8 and rows with
// at least 2 readable bytes past the end - confirm.
2801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
2802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                    uint8* dst_sobely, int width) {
2803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  asm volatile (
2804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  "1:                                          \n"
    // Both row pointers step by %4 = 1 after the first two loads and by
    // %5 = 6 after the third, for a net step of 8 per pass.
2805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d0}, [%0],%4                  \n"  // left
2807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d1}, [%1],%4                  \n"
2809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q0, d0, d1                     \n"
2810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
2812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%1],%4                  \n"
2814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q1, d2, d3                     \n"
    // The center difference is added twice to apply its weight of 2.
2815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
2816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
2817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(0)
2818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d2}, [%0],%5                  \n"  // right
2819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(1)
2820ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vld1.8     {d3}, [%1],%5                  \n"
2821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "subs       %3, %3, #8                     \n"  // 8 pixels
2822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vsubl.u8   q1, d2, d3                     \n"
2823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vadd.s16   q0, q0, q1                     \n"
    // |s| is at most 4 * 255 = 1020, so it fits in 16 bits; the saturating
    // narrow then clamps anything above 255.
2824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vabs.s16   q0, q0                         \n"
2825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vqmovn.u16 d0, q0                         \n"
2826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    MEMACCESS(2)
2827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
2828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "bgt        1b                             \n"
2829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "+r"(src_y0),      // %0
2830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(src_y1),      // %1
2831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(dst_sobely),  // %2
2832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "+r"(width)        // %3
2833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "r"(1),            // %4
2834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    "r"(6)             // %5
2835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  : "cc", "memory", "q0", "q1"  // Clobber List
2836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  );
2837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2838da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
2839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus
2841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}  // extern "C"
2842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}  // namespace libyuv
2843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
2844