/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
1033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h"
1233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampnamespace libyuv {
1533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampextern "C" {
1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
1733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// This module is for GCC Neon
1933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
2033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Read 8 Y, 4 U and 4 V from 422.
// Asm operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each pointer
// is post-incremented by the number of bytes read (8, 4 and 4).
// Result: d0 = 8 Y bytes; d2 = 4 U bytes in the low 32-bit lane followed by
// 4 V bytes in the high 32-bit lane.
#define READYUV422                                                             \
    "vld1.u8    {d0}, [%0]!                    \n"                             \
    "vld1.u32   {d2[0]}, [%1]!                 \n"                             \
    "vld1.u32   {d2[1]}, [%2]!                 \n"
2633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Read 8 Y and 4 UV from NV12.
// Asm operands: %0 = Y pointer, %1 = interleaved UV pointer; both are
// post-incremented by 8 bytes.
// Result: d0 = 8 Y bytes; d2 = u0..u3 followed by v0..v3. d3 is used as
// scratch and is overwritten.
// The last line deliberately has no trailing backslash so the macro ends
// here and cannot absorb whatever line follows it.
#define READNV12                                                               \
    "vld1.u8    {d0}, [%0]!                    \n"                             \
    "vld1.u8    {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d2, d3                         \n"/* d2=u0..3 x2, d3=v0..3 x2*/\
    "vtrn.u32   d2, d3                         \n"/* d2 = u0..u3, v0..v3 */

// Read 8 Y and 4 VU from NV21.
// Asm operands: %0 = Y pointer, %1 = interleaved VU pointer; both are
// post-incremented by 8 bytes.
// Result: d0 = 8 Y bytes; d2 = u0..u3 followed by v0..v3, i.e. the same
// layout READNV12 produces (the vuzp operands are swapped to undo the V-first
// byte order). d3 is used as scratch and is overwritten.
// The last line deliberately has no trailing backslash so the macro ends
// here and cannot absorb whatever line follows it.
#define READNV21                                                               \
    "vld1.u8    {d0}, [%0]!                    \n"                             \
    "vld1.u8    {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d3, d2                         \n"/* d3=v0..3 x2, d2=u0..3 x2*/\
    "vtrn.u32   d2, d3                         \n"/* d2 = u0..u3, v0..v3 */

// Convert 8 pixels of YUV (4:2:2 chroma layout) to 8-bit B, G and R.
// Register inputs:
//   d0  = 8 Y bytes
//   d2  = 4 U bytes followed by 4 V bytes
//   d24 = per-byte B/R multipliers (applied to U in the low half, V in the
//         high half)
//   d25 = per-byte G multipliers (same U/V split)
//   d26 = 128 in every byte, used to re-bias U and V to signed values
//   q14 = 16-bit Y multiplier, q15 = 16-bit Y offset
// Register outputs:
//   d20 = b0..b7, d21 = g0..g7, d22 = r0..r7
// Clobbers d0, d1, d2 and d16-d23 (q8-q11); d19 and d23 hold no useful
// value afterwards, so a caller that needs an alpha byte in either of them
// has to set it after every expansion of this macro.
#define YUV422TORGB                                                            \
    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
    "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\
    "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\
    "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\
    "vtrn.u8    d0, d1                         \n"/* d0=even y, d1=odd y     */\
    "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\
    "vmul.s16   q0, q0, q14                    \n"/* scale y                 */\
    "vadd.s16   d18, d19                       \n"/* g = u part + v part     */\
    "vqadd.s16  d20, d0, d16                   \n"/* b, even pixels          */\
    "vqadd.s16  d21, d1, d16                   \n"/* b, odd pixels           */\
    "vqadd.s16  d22, d0, d17                   \n"/* r, even pixels          */\
    "vqadd.s16  d23, d1, d17                   \n"/* r, odd pixels           */\
    "vqadd.s16  d16, d0, d18                   \n"/* g, even pixels          */\
    "vqadd.s16  d17, d1, d18                   \n"/* g, odd pixels           */\
    "vqrshrun.s16 d0, q10, #6                  \n"/* b >> 6, clamp to u8     */\
    "vqrshrun.s16 d1, q11, #6                  \n"/* r >> 6, clamp to u8     */\
    "vqrshrun.s16 d2, q8, #6                   \n"/* g >> 6, clamp to u8     */\
    "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\
    "vmovl.u8   q11, d1                        \n"                             \
    "vmovl.u8   q8, d2                         \n"                             \
    "vtrn.u8    d20, d21                       \n"/* d20 = b0..b7            */\
    "vtrn.u8    d22, d23                       \n"/* d22 = r0..r7            */\
    "vtrn.u8    d16, d17                       \n"/* d16 = g0..g7            */\
    "vmov.u8    d21, d16                       \n"/* d21 = g0..g7 */
6833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Chroma multipliers shared by every YUV-to-RGB row function in this file.
// Each function loads only the first 8 bytes of a table (into d24 / d25);
// bytes 0-3 multiply U and bytes 4-7 multiply V. The results are later
// shifted right by 6, so the values are presumably coefficients scaled by 64
// (NOTE(review): confirm against the C reference implementation).
// The guard must name every function that takes the address of these tables,
// otherwise enabling one of them on its own fails to compile.
#if defined(HAS_I422TOARGBROW_NEON) || defined(HAS_I422TOBGRAROW_NEON) ||      \
    defined(HAS_I422TOABGRROW_NEON) || defined(HAS_I422TORGBAROW_NEON) ||      \
    defined(HAS_I422TORGB24ROW_NEON) || defined(HAS_I422TORAWROW_NEON) ||      \
    defined(HAS_NV12TOARGBROW_NEON) || defined(HAS_NV21TOARGBROW_NEON)
// U contribution to B (bytes 0-3) and V contribution to R (bytes 4-7).
static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
                              0, 0, 0, 0, 0, 0, 0, 0 };
// U (bytes 0-3) and V (bytes 4-7) contributions to G.
static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
                             0, 0, 0, 0, 0, 0, 0, 0 };
#endif
7633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_I422TOARGBROW_NEON
// Converts one row of I422 (separate Y, U and V planes) to 32-bit pixels
// stored in memory as B, G, R, 255.
//   y_buf:   source Y; 8 bytes are read per loop iteration.
//   u_buf:   source U; 4 bytes are read per loop iteration.
//   v_buf:   source V; 4 bytes are read per loop iteration.
//   rgb_buf: destination; 32 bytes are written per loop iteration.
//   width:   number of pixels; decremented by 8 per iteration.
// The loop body runs before width is tested, so it always executes at least
// once and only handles whole groups of 8 pixels: the buffers must cover
// width rounded up to a multiple of 8.
void I422ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    // Loop-invariant constants expected by YUV422TORGB.
    "vld1.u8    {d24}, [%5]                    \n"
    "vld1.u8    {d25}, [%6]                    \n"
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    // Alpha is set inside the loop because YUV422TORGB overwrites d23.
    "vmov.u8    d23, #255                      \n"
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TOARGBROW_NEON
10933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_I422TOBGRAROW_NEON
// Converts one row of I422 (separate Y, U and V planes) to 32-bit pixels
// stored in memory as 255, R, G, B.
//   y_buf:   source Y; 8 bytes are read per loop iteration.
//   u_buf:   source U; 4 bytes are read per loop iteration.
//   v_buf:   source V; 4 bytes are read per loop iteration.
//   rgb_buf: destination; 32 bytes are written per loop iteration.
//   width:   number of pixels; decremented by 8 per iteration.
// The loop body runs before width is tested, so it always executes at least
// once and only handles whole groups of 8 pixels: the buffers must cover
// width rounded up to a multiple of 8.
void I422ToBGRARow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    // Loop-invariant constants expected by YUV422TORGB.
    "vld1.u8    {d24}, [%5]                    \n"
    "vld1.u8    {d25}, [%6]                    \n"
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    // Exchange the B and R registers so d20..d22 hold R, G, B.
    "vswp.u8    d20, d22                       \n"
    // Alpha is set inside the loop because YUV422TORGB overwrites d19.
    "vmov.u8    d19, #255                      \n"
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TOBGRAROW_NEON
14333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_I422TOABGRROW_NEON
// Converts one row of I422 (separate Y, U and V planes) to 32-bit pixels
// stored in memory as R, G, B, 255.
//   y_buf:   source Y; 8 bytes are read per loop iteration.
//   u_buf:   source U; 4 bytes are read per loop iteration.
//   v_buf:   source V; 4 bytes are read per loop iteration.
//   rgb_buf: destination; 32 bytes are written per loop iteration.
//   width:   number of pixels; decremented by 8 per iteration.
// The loop body runs before width is tested, so it always executes at least
// once and only handles whole groups of 8 pixels: the buffers must cover
// width rounded up to a multiple of 8.
void I422ToABGRRow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    // Loop-invariant constants expected by YUV422TORGB.
    "vld1.u8    {d24}, [%5]                    \n"
    "vld1.u8    {d25}, [%6]                    \n"
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    // Exchange the B and R registers so d20..d22 hold R, G, B.
    "vswp.u8    d20, d22                       \n"
    // Alpha is set inside the loop because YUV422TORGB overwrites d23.
    "vmov.u8    d23, #255                      \n"
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TOABGRROW_NEON
17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_I422TORGBAROW_NEON
// Converts one row of I422 (separate Y, U and V planes) to 32-bit pixels
// stored in memory as 255, B, G, R.
//   y_buf:   source Y; 8 bytes are read per loop iteration.
//   u_buf:   source U; 4 bytes are read per loop iteration.
//   v_buf:   source V; 4 bytes are read per loop iteration.
//   rgb_buf: destination; 32 bytes are written per loop iteration.
//   width:   number of pixels; decremented by 8 per iteration.
// The loop body runs before width is tested, so it always executes at least
// once and only handles whole groups of 8 pixels: the buffers must cover
// width rounded up to a multiple of 8.
void I422ToRGBARow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    // Loop-invariant constants expected by YUV422TORGB.
    "vld1.u8    {d24}, [%5]                    \n"
    "vld1.u8    {d25}, [%6]                    \n"
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    // Alpha is set inside the loop because YUV422TORGB overwrites d19.
    "vmov.u8    d19, #255                      \n"
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TORGBAROW_NEON
21033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_I422TORGB24ROW_NEON
// Converts one row of I422 (separate Y, U and V planes) to 24-bit pixels
// stored in memory as B, G, R.
//   y_buf:   source Y; 8 bytes are read per loop iteration.
//   u_buf:   source U; 4 bytes are read per loop iteration.
//   v_buf:   source V; 4 bytes are read per loop iteration.
//   rgb_buf: destination; 24 bytes are written per loop iteration.
//   width:   number of pixels; decremented by 8 per iteration.
// The loop body runs before width is tested, so it always executes at least
// once and only handles whole groups of 8 pixels: the buffers must cover
// width rounded up to a multiple of 8.
void I422ToRGB24Row_NEON(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width) {
  asm volatile (
    // Loop-invariant constants expected by YUV422TORGB.
    "vld1.u8    {d24}, [%5]                    \n"
    "vld1.u8    {d25}, [%6]                    \n"
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    // d20, d21, d22 already hold B, G, R; store them interleaved.
    "vst3.8     {d20, d21, d22}, [%3]!         \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TORGB24ROW_NEON
24233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_I422TORAWROW_NEON
// Converts one row of I422 (separate Y, U and V planes) to 24-bit pixels
// stored in memory as R, G, B.
//   y_buf:   source Y; 8 bytes are read per loop iteration.
//   u_buf:   source U; 4 bytes are read per loop iteration.
//   v_buf:   source V; 4 bytes are read per loop iteration.
//   rgb_buf: destination; 24 bytes are written per loop iteration.
//   width:   number of pixels; decremented by 8 per iteration.
// The loop body runs before width is tested, so it always executes at least
// once and only handles whole groups of 8 pixels: the buffers must cover
// width rounded up to a multiple of 8.
void I422ToRAWRow_NEON(const uint8* y_buf,
                       const uint8* u_buf,
                       const uint8* v_buf,
                       uint8* rgb_buf,
                       int width) {
  asm volatile (
    // Loop-invariant constants expected by YUV422TORGB.
    "vld1.u8    {d24}, [%5]                    \n"
    "vld1.u8    {d25}, [%6]                    \n"
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    // Exchange the B and R registers so d20..d22 hold R, G, B.
    "vswp.u8    d20, d22                       \n"
    "vst3.8     {d20, d21, d22}, [%3]!         \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TORAWROW_NEON
27533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_NV12TOARGBROW_NEON
// Converts one row of NV12 (a Y plane plus an interleaved U,V plane) to
// 32-bit pixels stored in memory as B, G, R, 255.
//   y_buf:   source Y; 8 bytes are read per loop iteration.
//   uv_buf:  source chroma, U first in each pair; 8 bytes are read per loop
//            iteration.
//   rgb_buf: destination; 32 bytes are written per loop iteration.
//   width:   number of pixels; decremented by 8 per iteration.
// The loop body runs before width is tested, so it always executes at least
// once and only handles whole groups of 8 pixels: the buffers must cover
// width rounded up to a multiple of 8.
void NV12ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    // Loop-invariant constants expected by YUV422TORGB.
    "vld1.u8    {d24}, [%4]                    \n"
    "vld1.u8    {d25}, [%5]                    \n"
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READNV12
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    // Alpha is set inside the loop because YUV422TORGB overwrites d23.
    "vmov.u8    d23, #255                      \n"
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(uv_buf),   // %1
      "+r"(rgb_buf),  // %2
      "+r"(width)     // %3
    : "r"(&kUVToRB),  // %4
      "r"(&kUVToG)    // %5
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_NV12TOARGBROW_NEON
30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_NV21TOARGBROW_NEON
// Converts one row of NV21 (a Y plane plus an interleaved V,U plane) to
// 32-bit pixels stored in memory as B, G, R, 255.
//   y_buf:   source Y; 8 bytes are read per loop iteration.
//   uv_buf:  source chroma, V first in each pair; 8 bytes are read per loop
//            iteration.
//   rgb_buf: destination; 32 bytes are written per loop iteration.
//   width:   number of pixels; decremented by 8 per iteration.
// The loop body runs before width is tested, so it always executes at least
// once and only handles whole groups of 8 pixels: the buffers must cover
// width rounded up to a multiple of 8.
void NV21ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    // Loop-invariant constants expected by YUV422TORGB.
    "vld1.u8    {d24}, [%4]                    \n"
    "vld1.u8    {d25}, [%5]                    \n"
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READNV21
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    // Alpha is set inside the loop because YUV422TORGB overwrites d23.
    "vmov.u8    d23, #255                      \n"
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(uv_buf),   // %1
      "+r"(rgb_buf),  // %2
      "+r"(width)     // %3
    : "r"(&kUVToRB),  // %4
      "r"(&kUVToG)    // %5
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_NV21TOARGBROW_NEON
33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_SPLITUV_NEON
33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %3, %3, #16                    \n"  // 16 processed per loop
34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8    {q0}, [%1]!                    \n"  // store U
34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8    {q1}, [%2]!                    \n"  // Store V
34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_uv),  // %0
35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_u),   // %1
35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_v),   // %2
35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(width)    // %3  // Output registers
35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :                       // Input registers
35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "memory", "cc", "q0", "q1"  // Clobber List
35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_SPLITUV_NEON
35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COPYROW_NEON
36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Copy multiple of 64
36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CopyRow_NEON(const uint8* src, uint8* dst, int count) {
36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vldm       %0!, {q0, q1, q2, q3}          \n"  // load 64
36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, %2, #64                    \n"  // 64 processed per loop
36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vstm       %1!, {q0, q1, q2, q3}          \n"  // store 64
36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
37033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src),   // %0
37133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst),   // %1
37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(count)  // %2  // Output registers
37333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :                     // Input registers
37433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
37533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
37633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
37733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_COPYROW_NEON
37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_SETROW_NEON
38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// SetRow8 writes 'count' bytes using a 32 bit value repeated.
38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SetRow8_NEON(uint8* dst, uint32 v32, int count) {
38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (  // NOLINT
38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u32  {q0}, [%0]!                     \n"  // store
38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt       1b                              \n"
38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(dst),   // %0
38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(count)  // %1
39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r"(v32)     // %2
39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "q0", "memory", "cc");
39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): Make fully assembler
39533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// SetRow32 writes 'count' words using a 32 bit value repeated.
39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SetRows32_NEON(uint8* dst, uint32 v32, int width,
39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                    int dst_stride, int height) {
39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int y = 0; y < height; ++y) {
39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    SetRow8_NEON(dst, v32, width << 2);
40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    dst += dst_stride;
40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_SETROW_NEON
40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_NEON
40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // compute where to start writing destination
40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add         %1, %2                        \n"
41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // work on segments that are multiples of 16
41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lsrs        r3, %2, #4                    \n"
41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // the output is written in two block. 8 bytes followed
41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // by another 8. reading is done sequentially, from left to
41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // right. writing is done from right to left in block sizes
41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // %1, the destination pointer is incremented after writing
41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // the first of the two blocks. need to subtract that 8 off
41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // along with 16 to get the next location.
41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov         r3, #-24                      \n"
41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "beq         2f                            \n"
42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // back of destination by the size of the register that is
42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // going to be mirrored
42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub         %1, #16                       \n"
42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // the loop needs to run on blocks of 16. what will be left
42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // over is either a negative number, the residuals that need
42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // to be done, or 0. If this isn't subtracted off here the
42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // loop will run one extra time.
42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub         %2, #16                       \n"
42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
43033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // mirror the bytes in the 64 bit segments. unable to mirror
43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // the bytes in the entire 128 bits in one go.
43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // because of the inability to mirror the entire 128 bits
43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // mirror the writing out of the two 64 bit segments.
43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.8      {q0}, [%0]!                   \n"  // src += 16
43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs        %2, #16                       \n"
43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrev64.8    q0, q0                        \n"
43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.8      {d1}, [%1]!                   \n"
44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.8      {d0}, [%1], r3                \n"  // dst -= 16
44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bge         1b                            \n"
44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // add 16 back to the counter. if the result is 0 there is no
44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // residuals so jump past
44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "adds        %2, #16                       \n"
44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "beq         5f                            \n"
44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add         %1, #16                       \n"
44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "2:                                          \n"
44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov         r3, #-3                       \n"
45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub         %1, #2                        \n"
45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs        %2, #2                        \n"
45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // check for 16*n+1 scenarios where segments_of_2 should not
45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // be run, but there is something left over.
45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "blt         4f                            \n"
45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// do this in neon registers as per
45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "3:                                          \n"
45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld2.8      {d0[0], d1[0]}, [%0]!         \n"  // src += 2
46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs        %2, #2                        \n"
46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.8      {d1[0]}, [%1]!                \n"
46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.8      {d0[0]}, [%1], r3             \n"  // dst -= 2
46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bge         3b                            \n"
46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "adds        %2, #2                        \n"
46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "beq         5f                            \n"
46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "4:                                          \n"
46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add         %1, #1                        \n"
46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.8      {d0[0]}, [%0]                 \n"
47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.8      {d0[0]}, [%1]                 \n"
47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "5:                                          \n"
47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src),   // %0
47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst),   // %1
47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(width)  // %2
47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "memory", "cc", "r3", "q0"
47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_MIRRORROW_NEON
48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROWUV_NEON
48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // compute where to start writing destination
48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add         %1, %3                        \n"  // dst_a + width
48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add         %2, %3                        \n"  // dst_b + width
48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // work on input segments that are multiples of 16, but
48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // width that has been passed is output segments, half
48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // the size of input.
49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lsrs        r12, %3, #3                   \n"
49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "beq         2f                            \n"
49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // the output is written in to two blocks.
49333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov         r12, #-8                      \n"
49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // back of destination by the size of the register that is
49533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // going to be mirrord
49633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub         %1, #8                        \n"
49733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub         %2, #8                        \n"
49833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // the loop needs to run on blocks of 8. what will be left
49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // over is either a negative number, the residuals that need
50033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // to be done, or 0. if this isn't subtracted off here the
50133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // loop will run one extra time.
50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub         %3, #8                        \n"
50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // mirror the bytes in the 64 bit segments
50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld2.8      {d0, d1}, [%0]!               \n"  // src += 16
50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs        %3, #8                        \n"
50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrev64.8    q0, q0                        \n"
51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.8      {d0}, [%1], r12               \n"  // dst_a -= 8
51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.8      {d1}, [%2], r12               \n"  // dst_b -= 8
51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bge         1b                            \n"
51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // add 8 back to the counter. if the result is 0 there is no
51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // residuals so return
51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "adds        %3, #8                        \n"
51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "beq         4f                            \n"
51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add         %1, #8                        \n"
51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add         %2, #8                        \n"
52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "2:                                          \n"
52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov         r12, #-1                      \n"
52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub         %1, #1                        \n"
52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub         %2, #1                        \n"
52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "3:                                          \n"
52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "vld2.8      {d0[0], d1[0]}, [%0]!       \n"  // src += 2
52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "subs        %3, %3, #1                  \n"
52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "vst1.8      {d0[0]}, [%1], r12          \n"  // dst_a -= 1
52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "vst1.8      {d1[0]}, [%2], r12          \n"  // dst_b -= 1
52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "bgt         3b                          \n"
53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "4:                                          \n"
53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src),    // %0
53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_a),  // %1
53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_b),  // %2
53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(width)   // %3
53533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
53633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "memory", "cc", "r12", "q0"
53733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
53833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
53933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_MIRRORROWUV_NEON
54033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
54133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_BGRATOARGBROW_NEON
54233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
54333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
54433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
54533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
54633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
54733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
54833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vswp.u8    d1, d2                         \n"  // swap G, R
54933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vswp.u8    d0, d3                         \n"  // swap B, A
55033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
55133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
55233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_bgra),  // %0
55333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %1
55433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
55533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
55633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
55733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
55833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
55933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_BGRATOARGBROW_NEON
56033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
56133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ABGRTOARGBROW_NEON
56233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
56333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
56433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
56533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
56633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
56733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
56833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vswp.u8    d0, d2                         \n"  // swap R, B
56933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
57033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
57133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_abgr),  // %0
57233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %1
57333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
57433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
57533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
57633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
57733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
57833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ABGRTOARGBROW_NEON
57933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
58033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_RGBATOARGBROW_NEON
58133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
58233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
58333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
58433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                           \n"
58533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.8     {d0, d1, d2, d3}, [%0]!         \n"  // load 8 pixels of RGBA.
58633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, %2, #8                      \n"  // 8 processed per loop.
58733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmov.u8    d4, d0                          \n"  // move A after RGB
58833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst4.8     {d1, d2, d3, d4}, [%1]!         \n"  // store 8 pixels of ARGB.
58933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                              \n"
59033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_rgba),  // %0
59133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %1
59233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
59333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
59433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
59533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
59633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
59733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_RGBATOARGBROW_NEON
59833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
59933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_RGB24TOARGBROW_NEON
60033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
60133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
60233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmov.u8    d4, #255                       \n"  // Alpha
60333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
60433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
60533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
60633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
60733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
60833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
60933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_rgb24),  // %0
61033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),   // %1
61133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)         // %2
61233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
61333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
61433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
61533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
61633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_RGB24TOARGBROW_NEON
61733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
61833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_RAWTOARGBROW_NEON
61933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
62033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
62133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmov.u8    d4, #255                       \n"  // Alpha
62233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
62333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
62433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
62533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
62633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vswp.u8    d1, d3                         \n"  // swap R, B
62733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
62833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
62933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_raw),   // %0
63033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %1
63133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
63233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
63333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
63433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
63533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
63633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_RAWTOARGBROW_NEON
63733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_ARGBTORGBAROW_NEON
// Reorders the bytes of each 4-byte pixel in a row: the fourth source byte
// (alpha) is written first, followed by the first three source bytes in
// their original order.
//   src_argb: source row, 4 bytes per pixel.
//   dst_rgba: destination row, 4 bytes per pixel.
//   pix:      number of pixels. 8 pixels are handled per pass and the count
//             is tested after the pass, so at least one pass always runs.
void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d1, d2, d3, d4}, [%[src]]!    \n"  // fetch 8 ARGB pixels.
    "subs       %[count], %[count], #8         \n"  // 8 pixels per pass.
    "vmov.u8    d0, d4                         \n"  // alpha goes in front.
    "vst4.8     {d0, d1, d2, d3}, [%[dst]]!    \n"  // emit 8 RGBA pixels.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_argb),
    [dst]   "+r"(dst_rgba),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc", "d0", "d1", "d2", "d3", "d4"
  );
}
#endif  // HAS_ARGBTORGBAROW_NEON
65633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_ARGBTORGB24ROW_NEON
// Packs a row of 4-byte ARGB pixels into 3-byte RGB24 pixels by keeping the
// first three bytes of every pixel and discarding the fourth (alpha).
//   src_argb:  source row, 4 bytes per pixel.
//   dst_rgb24: destination row, 3 bytes per pixel.
//   pix:       number of pixels. 8 pixels are handled per pass and the count
//              is tested after the pass, so at least one pass always runs.
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d1, d2, d3, d4}, [%[src]]!    \n"  // fetch 8 ARGB pixels.
    "subs       %[count], %[count], #8         \n"  // 8 pixels per pass.
    "vst3.8     {d1, d2, d3}, [%[dst]]!        \n"  // emit 8 RGB24 pixels.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_argb),
    [dst]   "+r"(dst_rgb24),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc", "d1", "d2", "d3", "d4"
  );
}
#endif  // HAS_ARGBTORGB24ROW_NEON
67433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_ARGBTORAWROW_NEON
// Packs a row of 4-byte ARGB pixels into 3-byte RAW pixels: the fourth byte
// (alpha) is dropped and the first and third bytes of each pixel trade
// places.
//   src_argb: source row, 4 bytes per pixel.
//   dst_raw:  destination row, 3 bytes per pixel.
//   pix:      number of pixels. 8 pixels are handled per pass and the count
//             is tested after the pass, so at least one pass always runs.
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d1, d2, d3, d4}, [%[src]]!    \n"  // fetch 8 ARGB pixels.
    "subs       %[count], %[count], #8         \n"  // 8 pixels per pass.
    "vswp.u8    d1, d3                         \n"  // exchange R and B.
    "vst3.8     {d1, d2, d3}, [%[dst]]!        \n"  // emit 8 RAW pixels.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_argb),
    [dst]   "+r"(dst_raw),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc", "d1", "d2", "d3", "d4"
  );
}
#endif  // HAS_ARGBTORAWROW_NEON
69333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_YUY2TOYROW_NEON
// Extracts the luma samples from a row of YUY2 by copying every even-indexed
// source byte to the destination.
//   src_yuy2: source row, 2 bytes per pixel.
//   dst_y:    destination row, 1 byte per pixel.
//   pix:      number of pixels. 16 pixels (32 source bytes) are handled per
//             pass and the count is tested after the pass, so at least one
//             pass always runs.
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld2.u8    {q0, q1}, [%[src]]!            \n"  // q0 = even, q1 = odd bytes.
    "subs       %[count], %[count], #16        \n"  // 16 pixels per pass.
    "vst1.u8    {q0}, [%[dst]]!                \n"  // even bytes are the 16 Y.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_yuy2),
    [dst]   "+r"(dst_y),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc", "q0", "q1"
  );
}
#endif  // HAS_YUY2TOYROW_NEON
71133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_UYVYTOYROW_NEON
// Extracts the luma samples from a row of UYVY by copying every odd-indexed
// source byte to the destination.
//   src_uyvy: source row, 2 bytes per pixel.
//   dst_y:    destination row, 1 byte per pixel.
//   pix:      number of pixels. 16 pixels (32 source bytes) are handled per
//             pass and the count is tested after the pass, so at least one
//             pass always runs.
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld2.u8    {q0, q1}, [%[src]]!            \n"  // q0 = even, q1 = odd bytes.
    "subs       %[count], %[count], #16        \n"  // 16 pixels per pass.
    "vst1.u8    {q1}, [%[dst]]!                \n"  // odd bytes are the 16 Y.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_uyvy),
    [dst]   "+r"(dst_y),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc", "q0", "q1"
  );
}
#endif  // HAS_UYVYTOYROW_NEON
72933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_YUY2TOYROW_NEON
// Splits the chroma out of a single row of YUY2. Within each 4-byte group
// the second byte is written to dst_u and the fourth byte to dst_v; the two
// luma bytes are loaded but not stored.
//   src_yuy2: source row, 4 bytes per pair of pixels.
//   dst_u:    destination for U, one byte per pair of pixels.
//   dst_v:    destination for V, one byte per pair of pixels.
//   pix:      number of pixels. 16 pixels (8 U and 8 V) are handled per pass
//             and the count is tested after the pass, so at least one pass
//             always runs.
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%[src]]!    \n"  // d0,d2 = Y; d1 = U; d3 = V.
    "subs       %[count], %[count], #16        \n"  // 16 pixels -> 8 U + 8 V.
    "vst1.u8    {d1}, [%[dst_u]]!              \n"  // write 8 U samples.
    "vst1.u8    {d3}, [%[dst_v]]!              \n"  // write 8 V samples.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_yuy2),
    [dst_u] "+r"(dst_u),
    [dst_v] "+r"(dst_v),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc", "d0", "d1", "d2", "d3"
  );
}
#endif  // HAS_YUY2TOYROW_NEON
75033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_UYVYTOYROW_NEON
// Splits the chroma out of a single row of UYVY. Within each 4-byte group
// the first byte is written to dst_u and the third byte to dst_v; the two
// luma bytes are loaded but not stored.
//   src_uyvy: source row, 4 bytes per pair of pixels.
//   dst_u:    destination for U, one byte per pair of pixels.
//   dst_v:    destination for V, one byte per pair of pixels.
//   pix:      number of pixels. 16 pixels (8 U and 8 V) are handled per pass
//             and the count is tested after the pass, so at least one pass
//             always runs.
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%[src]]!    \n"  // d0 = U; d2 = V; d1,d3 = Y.
    "subs       %[count], %[count], #16        \n"  // 16 pixels -> 8 U + 8 V.
    "vst1.u8    {d0}, [%[dst_u]]!              \n"  // write 8 U samples.
    "vst1.u8    {d2}, [%[dst_v]]!              \n"  // write 8 V samples.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_uyvy),
    [dst_u] "+r"(dst_u),
    [dst_v] "+r"(dst_v),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc", "d0", "d1", "d2", "d3"
  );
}
#endif  // HAS_UYVYTOYROW_NEON
77133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_YUY2TOYROW_NEON
// Produces one row of U and one row of V from two rows of YUY2. The chroma
// bytes of the row at src_yuy2 and of the row stride_yuy2 bytes after it are
// combined with a rounding halving add (vrhadd) before being stored.
//   src_yuy2:    first source row, 4 bytes per pair of pixels.
//   stride_yuy2: byte offset from src_yuy2 to the second source row.
//   dst_u:       destination for U, one byte per pair of pixels.
//   dst_v:       destination for V, one byte per pair of pixels.
//   pix:         number of pixels. 16 pixels (8 U and 8 V) are handled per
//                pass and the count is tested after the pass, so at least one
//                pass always runs.
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // The stride operand is overwritten with the address of the second row.
    "adds       %[next], %[src], %[next]       \n"  // next = src + stride.
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%[src]]!    \n"  // row 0: d1 = U, d3 = V.
    "subs       %[count], %[count], #16        \n"  // 16 pixels -> 8 U + 8 V.
    "vld4.8     {d4, d5, d6, d7}, [%[next]]!   \n"  // row 1: d5 = U, d7 = V.
    "vrhadd.u8  d1, d1, d5                     \n"  // U = avg of both rows.
    "vrhadd.u8  d3, d3, d7                     \n"  // V = avg of both rows.
    "vst1.u8    {d1}, [%[dst_u]]!              \n"  // write 8 U samples.
    "vst1.u8    {d3}, [%[dst_v]]!              \n"  // write 8 V samples.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_yuy2),
    [next]  "+r"(stride_yuy2),
    [dst_u] "+r"(dst_u),
    [dst_v] "+r"(dst_v),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc",
    "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
  );
}
#endif  // HAS_YUY2TOYROW_NEON
79733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
#ifdef HAS_UYVYTOYROW_NEON
// Produces one row of U and one row of V from two rows of UYVY. The chroma
// bytes of the row at src_uyvy and of the row stride_uyvy bytes after it are
// combined with a rounding halving add (vrhadd) before being stored.
//   src_uyvy:    first source row, 4 bytes per pair of pixels.
//   stride_uyvy: byte offset from src_uyvy to the second source row.
//   dst_u:       destination for U, one byte per pair of pixels.
//   dst_v:       destination for V, one byte per pair of pixels.
//   pix:         number of pixels. 16 pixels (8 U and 8 V) are handled per
//                pass and the count is tested after the pass, so at least one
//                pass always runs.
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // The stride operand is overwritten with the address of the second row.
    "adds       %[next], %[src], %[next]       \n"  // next = src + stride.
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%[src]]!    \n"  // row 0: d0 = U, d2 = V.
    "subs       %[count], %[count], #16        \n"  // 16 pixels -> 8 U + 8 V.
    "vld4.8     {d4, d5, d6, d7}, [%[next]]!   \n"  // row 1: d4 = U, d6 = V.
    "vrhadd.u8  d0, d0, d4                     \n"  // U = avg of both rows.
    "vrhadd.u8  d2, d2, d6                     \n"  // V = avg of both rows.
    "vst1.u8    {d0}, [%[dst_u]]!              \n"  // write 8 U samples.
    "vst1.u8    {d2}, [%[dst_v]]!              \n"  // write 8 V samples.
    "bgt        1b                             \n"  // loop while count > 0.
  : [src]   "+r"(src_uyvy),
    [next]  "+r"(stride_uyvy),
    [dst_u] "+r"(dst_u),
    [dst_v] "+r"(dst_v),
    [count] "+r"(pix)
  :  // no input-only operands
  : "memory", "cc",
    "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
  );
}
#endif  // HAS_UYVYTOYROW_NEON
82333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
82433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // __ARM_NEON__
82533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
82633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
82733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // extern "C"
82833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // namespace libyuv
82933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
830