133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp/*
233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *
433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  Use of this source code is governed by a BSD-style license
533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  that can be found in the LICENSE file in the root of the source
633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  tree. An additional intellectual property rights grant can be found
733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  in the file PATENTS.  All contributing project authors may
833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  be found in the AUTHORS file in the root of the source tree.
933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp */
1033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/basic_types.h"
1233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h"
1333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
1533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampnamespace libyuv {
1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampextern "C" {
1733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
1833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// This module is for GCC Neon
2033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
2133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
2233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp/**
2333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * NEON downscalers with interpolation.
2433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *
2533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * Provided by Fritz Koenig
2633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *
2733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp */
2833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
2933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
3033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        uint8* dst, int dst_width) {
3133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
3233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
3333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // load even pixels into q0, odd into q1
3433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld2.u8    {q0,q1}, [%0]!                 \n"
3533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
3633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, %2, #16                    \n"  // 16 processed per loop
3733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
3833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
3933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst),              // %1
4033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width)         // %2
4133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
4233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "q0", "q1"              // Clobber List
4333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
4433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
4533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
4633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
4733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                           uint8* dst, int dst_width) {
4833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
4933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // change the stride to row 2 pointer
5033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add        %1, %0                         \n"
5133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
5233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post inc
5333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post inc
5433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
5533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8  q1, q1                         \n"
5633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
5733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpadal.u8  q1, q3                         \n"
5833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
5933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrshrn.u16 d1, q1, #2                     \n"
6033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8    {q0}, [%2]!                    \n"
6133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %3, %3, #16                    \n"  // 16 processed per loop
6233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
6333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
6433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(src_stride),       // %1
6533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst),              // %2
6633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width)         // %3
6733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
6833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "q0", "q1", "q2", "q3"     // Clobber List
6933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp   );
7033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
7133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
7233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
7333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        uint8* dst_ptr, int dst_width) {
7433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
7533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
7633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld2.u8    {d0, d1}, [%0]!                \n"
7733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8    d1, d0                         \n"
7833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vshrn.u16  d0, q0, #8                     \n"
7933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u32   {d0[1]}, [%1]!                 \n"
8033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, #4                         \n"
8133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
8233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
8333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_ptr),          // %1
8433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width)         // %2
8533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
8633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "q0", "q1", "memory", "cc"
8733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
8833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
8933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
9033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
9133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                           uint8* dst_ptr, int dst_width) {
9233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
9333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add        r4, %0, %3                     \n"
9433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add        r5, r4, %3                     \n"
9533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add        %3, r5, %3                     \n"
9633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
9733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8    {q0}, [%0]!                    \n"   // load up 16x4
9833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8    {q1}, [r4]!                    \n"
9933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8    {q2}, [r5]!                    \n"
10033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8    {q3}, [%3]!                    \n"
10133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8  q0, q0                         \n"
10233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpadal.u8  q0, q1                         \n"
10333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpadal.u8  q0, q2                         \n"
10433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpadal.u8  q0, q3                         \n"
10533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u16 q0, q0                         \n"
10633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
10733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovn.u16  d0, q0                         \n"
10833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u32   {d0[0]}, [%1]!                 \n"
10933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs       %2, #4                         \n"
11033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt        1b                             \n"
11133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
11233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_ptr),          // %1
11333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width)         // %2
11433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r"(src_stride)         // %3
11533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
11633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
11733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
11833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
11933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Down scale from 4 to 3 pixels. Use the neon multilane read/write
12033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// to load up the every 4th pixel into a 4 different registers.
12133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Point samples 32 pixels to 24 pixels.
12233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown34_NEON(const uint8* src_ptr,
12333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         ptrdiff_t /* src_stride */,
12433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* dst_ptr, int dst_width) {
12533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
12633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
12733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
12833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmov         d2, d3                       \n" // order d0, d1, d2
12933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
13033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %2, #24                      \n"
13133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          1b                           \n"
13233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
13333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_ptr),          // %1
13433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width)         // %2
13533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
13633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "d0", "d1", "d2", "d3", "memory", "cc"
13733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
13833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
13933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
14033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
14133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               ptrdiff_t src_stride,
14233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               uint8* dst_ptr, int dst_width) {
14333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
14433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmov.u8      d24, #3                      \n"
14533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add          %3, %0                       \n"
14633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
14733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
14833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
14933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
15033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // filter src line 0 with src line 1
15133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // expand chars to shorts to allow for room
15233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // when adding lines together
15333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q8, d4                       \n"
15433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q9, d5                       \n"
15533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q10, d6                      \n"
15633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q11, d7                      \n"
15733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
15833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 3 * line_0 + line_1
15933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q8, d0, d24                  \n"
16033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q9, d1, d24                  \n"
16133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q10, d2, d24                 \n"
16233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q11, d3, d24                 \n"
16333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
16433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // (3 * line_0 + line_1) >> 2
16533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d0, q8, #2                   \n"
16633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d1, q9, #2                   \n"
16733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d2, q10, #2                  \n"
16833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d3, q11, #2                  \n"
16933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
17033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // a0 = (src[0] * 3 + s[1] * 1) >> 2
17133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q8, d1                       \n"
17233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q8, d0, d24                  \n"
17333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d0, q8, #2                   \n"
17433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
17533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // a1 = (src[1] * 1 + s[2] * 1) >> 1
17633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrhadd.u8    d1, d1, d2                   \n"
17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
17833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // a2 = (src[2] * 1 + s[3] * 3) >> 2
17933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q8, d2                       \n"
18033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q8, d3, d24                  \n"
18133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d2, q8, #2                   \n"
18233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
18333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
18433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
18533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %2, #24                      \n"
18633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          1b                           \n"
18733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
18833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_ptr),          // %1
18933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width),        // %2
19033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(src_stride)        // %3
19133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
19233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
19333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
19433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
19533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
19633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
19733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               ptrdiff_t src_stride,
19833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               uint8* dst_ptr, int dst_width) {
19933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
20033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmov.u8      d24, #3                      \n"
20133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add          %3, %0                       \n"
20233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
20333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
20433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
20533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
20633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // average src line 0 with src line 1
20733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrhadd.u8    q0, q0, q2                   \n"
20833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrhadd.u8    q1, q1, q3                   \n"
20933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
21033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // a0 = (src[0] * 3 + s[1] * 1) >> 2
21133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q3, d1                       \n"
21233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q3, d0, d24                  \n"
21333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d0, q3, #2                   \n"
21433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
21533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // a1 = (src[1] * 1 + s[2] * 1) >> 1
21633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrhadd.u8    d1, d1, d2                   \n"
21733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
21833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // a2 = (src[2] * 1 + s[3] * 3) >> 2
21933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q3, d2                       \n"
22033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q3, d3, d24                  \n"
22133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d2, q3, #2                   \n"
22233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
22333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
22433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
22533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %2, #24                      \n"
22633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          1b                           \n"
22733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
22833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_ptr),          // %1
22933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width),        // %2
23033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(src_stride)        // %3
23133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
23233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
23333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
23433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
23533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
23633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_SCALEROWDOWN38_NEON
23733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampconst uvec8 kShuf38 =
23833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
23933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampconst uvec8 kShuf38_2 =
24033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
24133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampconst vec16 kMult38_Div6 =
24233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
24333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
24433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampconst vec16 kMult38_Div9 =
24533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
24633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
24733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
24833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 32 -> 12
24933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown38_NEON(const uint8* src_ptr,
25033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         ptrdiff_t /* src_stride */,
25133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* dst_ptr, int dst_width) {
25233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
25333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q3}, [%3]                   \n"
25433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
25533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {d0, d1, d2, d3}, [%0]!      \n"
25633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtbl.u8      d4, {d0, d1, d2, d3}, d6     \n"
25733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtbl.u8      d5, {d0, d1, d2, d3}, d7     \n"
25833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8      {d4}, [%1]!                  \n"
25933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u32     {d5[0]}, [%1]!               \n"
26033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %2, #12                      \n"
26133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          1b                           \n"
26233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
26333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_ptr),          // %1
26433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width)         // %2
26533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r"(&kShuf38)           // %3
26633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
26733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
26833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
26933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
27033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 32x3 -> 12x1
27133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
27233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                      ptrdiff_t src_stride,
27333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                      uint8* dst_ptr, int dst_width) {
27433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
27533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u16     {q13}, [%4]                  \n"
27633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q14}, [%5]                  \n"
27733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q15}, [%6]                  \n"
27833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add          r4, %0, %3, lsl #1           \n"
27933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add          %3, %0                       \n"
28033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
28133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
28233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d0 = 00 40 01 41 02 42 03 43
28333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d1 = 10 50 11 51 12 52 13 53
28433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d2 = 20 60 21 61 22 62 23 63
28533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d3 = 30 70 31 71 32 72 33 73
28633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
28733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
28833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d16, d17, d18, d19}, [r4]!  \n"
28933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
29033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Shuffle the input data around to get align the data
29133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
29233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d0 = 00 10 01 11 02 12 03 13
29333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d1 = 40 50 41 51 42 52 43 53
29433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d0, d1                       \n"
29533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d4, d5                       \n"
29633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d16, d17                     \n"
29733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
29833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d2 = 20 30 21 31 22 32 23 33
29933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d3 = 60 70 61 71 62 72 63 73
30033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d2, d3                       \n"
30133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d6, d7                       \n"
30233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d18, d19                     \n"
30333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
30433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d0 = 00+10 01+11 02+12 03+13
30533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d2 = 40+50 41+51 42+52 43+53
30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    q0, q0                       \n"
30733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    q2, q2                       \n"
30833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    q8, q8                       \n"
30933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
31033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d3 = 60+70 61+71 62+72 63+73
31133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    d3, d3                       \n"
31233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    d7, d7                       \n"
31333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    d19, d19                     \n"
31433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
31533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // combine source lines
31633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     q0, q2                       \n"
31733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     q0, q8                       \n"
31833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     d4, d3, d7                   \n"
31933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     d4, d19                      \n"
32033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
32133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
32233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //             + s[6 + st * 1] + s[7 + st * 1]
32333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
32433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrdmulh.s16 q2, q2, q13                  \n"
32533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovn.u16    d4, q2                       \n"
32633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
32733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Shuffle 2,3 reg around so that 2 can be added to the
32833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  0,1 reg and 3 can be added to the 4,5 reg. This
32933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  requires expanding from u8 to u16 as the 0,1 and 4,5
33033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  registers are already expanded. Then do transposes
33133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  to get aligned.
33233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
33333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q1, d2                       \n"
33433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q3, d6                       \n"
33533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q9, d18                      \n"
33633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // combine source lines
33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     q1, q3                       \n"
33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     q1, q9                       \n"
34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d4 = xx 20 xx 30 xx 22 xx 32
34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d5 = xx 21 xx 31 xx 23 xx 33
34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u32     d2, d3                       \n"
34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d4 = xx 20 xx 21 xx 22 xx 23
34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d5 = xx 30 xx 31 xx 32 xx 33
34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u16     d2, d3                       \n"
34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 0+1+2, 3+4+5
35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     q0, q1                       \n"
35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Need to divide, but can't downshift as the the value
35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  isn't a power of 2. So multiply by 65536 / n
35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  and take the upper 16 bits.
35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrdmulh.s16 q0, q0, q15                  \n"
35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Align for table lookup, vtbl requires registers to
35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  be adjacent
35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmov.u8      d2, d4                       \n"
36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8      {d3}, [%1]!                  \n"
36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u32     {d4[0]}, [%1]!               \n"
36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %2, #12                      \n"
36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          1b                           \n"
36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),          // %0
36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_ptr),          // %1
37033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width),        // %2
37133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(src_stride)        // %3
37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r"(&kMult38_Div6),     // %4
37333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "r"(&kShuf38_2),        // %5
37433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "r"(&kMult38_Div9)      // %6
37533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
37633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "q13", "q14", "q15", "memory", "cc"
37733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 32x2 -> 12x1
38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               ptrdiff_t src_stride,
38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               uint8* dst_ptr, int dst_width) {
38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u16     {q13}, [%4]                  \n"
38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q14}, [%5]                  \n"
38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add          %3, %0                       \n"
38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d0 = 00 40 01 41 02 42 03 43
39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d1 = 10 50 11 51 12 52 13 53
39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d2 = 20 60 21 61 22 62 23 63
39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d3 = 30 70 31 71 32 72 33 73
39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
39533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Shuffle the input data around to get align the data
39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d0 = 00 10 01 11 02 12 03 13
40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d1 = 40 50 41 51 42 52 43 53
40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d0, d1                       \n"
40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d4, d5                       \n"
40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d2 = 20 30 21 31 22 32 23 33
40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d3 = 60 70 61 71 62 72 63 73
40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d2, d3                       \n"
40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u8      d6, d7                       \n"
40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d0 = 00+10 01+11 02+12 03+13
41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d2 = 40+50 41+51 42+52 43+53
41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    q0, q0                       \n"
41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    q2, q2                       \n"
41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d3 = 60+70 61+71 62+72 63+73
41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    d3, d3                       \n"
41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vpaddl.u8    d7, d7                       \n"
41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // combine source lines
41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     q0, q2                       \n"
42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     d4, d3, d7                   \n"
42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrshrn.u16  d4, q2, #2                   \n"
42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Shuffle 2,3 reg around so that 2 can be added to the
42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  0,1 reg and 3 can be added to the 4,5 reg. This
42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  requires expanding from u8 to u16 as the 0,1 and 4,5
42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  registers are already expanded. Then do transposes
42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  to get aligned.
43033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q1, d2                       \n"
43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmovl.u8     q3, d6                       \n"
43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // combine source lines
43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     q1, q3                       \n"
43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d4 = xx 20 xx 30 xx 22 xx 32
43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d5 = xx 21 xx 31 xx 23 xx 33
43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u32     d2, d3                       \n"
44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d4 = xx 20 xx 21 xx 22 xx 23
44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // d5 = xx 30 xx 31 xx 32 xx 33
44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtrn.u16     d2, d3                       \n"
44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 0+1+2, 3+4+5
44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vadd.u16     q0, q1                       \n"
44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Need to divide, but can't downshift as the the value
44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  isn't a power of 2. So multiply by 65536 / n
45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  and take the upper 16 bits.
45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vqrdmulh.s16 q0, q0, q13                  \n"
45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Align for table lookup, vtbl requires registers to
45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    //  be adjacent
45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmov.u8      d2, d4                       \n"
45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8      {d3}, [%1]!                  \n"
46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u32     {d4[0]}, [%1]!               \n"
46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %2, #12                      \n"
46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          1b                           \n"
46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src_ptr),       // %0
46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_ptr),       // %1
46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width),     // %2
46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(src_stride)     // %3
46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r"(&kMult38_Div6),  // %4
46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "r"(&kShuf38_2)      // %5
47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 16x2 -> 16x1
47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleFilterRows_NEON(uint8* dst_ptr,
47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                          const uint8* src_ptr, ptrdiff_t src_stride,
47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                          int dst_width, int source_y_fraction) {
47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cmp          %4, #0                       \n"
48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "beq          2f                           \n"
48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add          %2, %1                       \n"
48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cmp          %4, #128                     \n"
48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "beq          3f                           \n"
48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vdup.8       d5, %4                       \n"
48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "rsb          %4, #256                     \n"
48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vdup.8       d4, %4                       \n"
48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q0}, [%1]!                  \n"
49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q1}, [%2]!                  \n"
49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %3, #16                      \n"
49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmull.u8     q13, d0, d4                  \n"
49333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmull.u8     q14, d1, d4                  \n"
49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q13, d2, d5                  \n"
49533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vmlal.u8     q14, d3, d5                  \n"
49633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrshrn.u16   d0, q13, #8                  \n"
49733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrshrn.u16   d1, q14, #8                  \n"
49833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8      {q0}, [%0]!                  \n"
49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          1b                           \n"
50033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "b            4f                           \n"
50133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "2:                                        \n"
50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q0}, [%1]!                  \n"
50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %3, #16                      \n"
50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8      {q0}, [%0]!                  \n"
50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          2b                           \n"
50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "b            4f                           \n"
50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "3:                                        \n"
51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q0}, [%1]!                  \n"
51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vld1.u8      {q1}, [%2]!                  \n"
51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "subs         %3, #16                      \n"
51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vrhadd.u8    q0, q1                       \n"
51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8      {q0}, [%0]!                  \n"
51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "bgt          3b                           \n"
51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "4:                                        \n"
51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "vst1.u8      {d1[7]}, [%0]                \n"
51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(dst_ptr),          // %0
51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(src_ptr),          // %1
52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(src_stride),       // %2
52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst_width),        // %3
52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(source_y_fraction) // %4
52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    :
52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // __ARM_NEON__
52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // extern "C"
53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // namespace libyuv
53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
535