1f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang/*
2f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *
4f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  Use of this source code is governed by a BSD-style license
5f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  that can be found in the LICENSE file in the root of the source
6f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  tree. An additional intellectual property rights grant can be found
7f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  in the file PATENTS. All contributing project authors may
8f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  be found in the AUTHORS file in the root of the source tree.
9f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang */
10f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
11f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include "libyuv/row.h"
12cead1e07666bcc5914f8927712c2f89b9b789f9bFrank Barchard#include "libyuv/scale.h"
13f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include "libyuv/scale_row.h"
14f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
15f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
16f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangnamespace libyuv {
17f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangextern "C" {
18f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
19f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
20f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// This module is for GCC Neon armv8 64 bit.
21f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
22f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
23f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 32x1 throw away even pixels, and write 16x1.
24b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleRowDown2_NEON(const uint8* src_ptr,
25b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        ptrdiff_t src_stride,
26b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst,
27b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int dst_width) {
28b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  (void)src_stride;
29f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
30f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
31f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // load even pixels into v0, odd into v1
32f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
33f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
34f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
35f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
36f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
37f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
38f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
39f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),              // %1
40f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)         // %2
41f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
42f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1"              // Clobber List
43f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
44f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
45f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
46f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 32x1 average down and write 16x1.
47b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleRowDown2Linear_NEON(const uint8* src_ptr,
48b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              ptrdiff_t src_stride,
49b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              uint8* dst,
50b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              int dst_width) {
51b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  (void)src_stride;
52f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
53f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
54f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
55f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
56f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
57f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
58f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v1.8h, v1.16b                  \n"
59f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
60f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn2     v0.16b, v1.8h, #1              \n"
61f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
62f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1        {v0.16b}, [%1], #16            \n"
63f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
64f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
65f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),              // %1
66f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)         // %2
67f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
68f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1"     // Clobber List
69f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
70f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
71f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
72f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 32x2 average down and write 16x1.
73b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleRowDown2Box_NEON(const uint8* src_ptr,
74b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           ptrdiff_t src_stride,
75b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           uint8* dst,
76b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           int dst_width) {
77f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
78f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // change the stride to row 2 pointer
79f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        %1, %1, %0                     \n"
80f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
81f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
82f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
83f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
84f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
85f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
86f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
87f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v1.8h, v1.16b                  \n"
88f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
89f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp     v1.8h, v3.16b                  \n"
90f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
91f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn2     v0.16b, v1.8h, #2              \n"
92f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(2)
93f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1        {v0.16b}, [%2], #16            \n"
94f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
95f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
96f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_stride),       // %1
97f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),              // %2
98f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)         // %3
99f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3"     // Clobber List
101f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
104b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleRowDown4_NEON(const uint8* src_ptr,
105b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        ptrdiff_t src_stride,
106b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst_ptr,
107b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int dst_width) {
108b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  (void)src_stride;
109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
112f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
113f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
114f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
115f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1     {v2.8b}, [%1], #8                 \n"
116f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
117f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
118f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),          // %1
119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)         // %2
120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "memory", "cc"
122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
125b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleRowDown4Box_NEON(const uint8* src_ptr,
126b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           ptrdiff_t src_stride,
127b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           uint8* dst_ptr,
128b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           int dst_width) {
129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint8* src_ptr1 = src_ptr + src_stride;
130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint8* src_ptr2 = src_ptr + src_stride * 2;
131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint8* src_ptr3 = src_ptr + src_stride * 3;
132b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
134f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
135f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(3)
137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1     {v1.16b}, [%2], #16               \n"
138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(4)
139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1     {v2.16b}, [%3], #16               \n"
140f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(5)
141f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1     {v3.16b}, [%4], #16               \n"
142f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs    %w5, %w5, #4                      \n"
143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp  v0.8h, v0.16b                     \n"
144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp  v0.8h, v1.16b                     \n"
145f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp  v0.8h, v2.16b                     \n"
146f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp  v0.8h, v3.16b                     \n"
147f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addp    v0.8h, v0.8h, v0.8h               \n"
148f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
149f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
150f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1    {v0.s}[0], [%1], #4                \n"
151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),   // %0
153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),   // %1
154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_ptr1),  // %2
155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_ptr2),  // %3
156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_ptr3),  // %4
157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)  // %5
158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "memory", "cc"
160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Down scale from 4 to 3 pixels. Use the neon multilane read/write
164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// to load up the every 4th pixel into a 4 different registers.
165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Point samples 32 pixels to 24 pixels.
166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ScaleRowDown34_NEON(const uint8* src_ptr,
167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                         ptrdiff_t src_stride,
168b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_ptr,
169b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int dst_width) {
170b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  (void)src_stride;
171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                                  \n"
173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
174b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs      %w2, %w2, #24                           \n"
176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
178b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt      1b                                      \n"
180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),          // %1
182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)         // %2
183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "memory", "cc"
185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               ptrdiff_t src_stride,
190b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               uint8* dst_ptr,
191b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               int dst_width) {
192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movi      v20.8b, #3                              \n"
194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       %3, %3, %0                              \n"
195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                                  \n"
196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
197b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(3)
199b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs         %w2, %w2, #24                        \n"
201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // filter src line 0 with src line 1
203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // expand chars to shorts to allow for room
204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // when adding lines together
205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v16.8h, v4.8b, #0                       \n"
206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v17.8h, v5.8b, #0                       \n"
207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v18.8h, v6.8b, #0                       \n"
208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v19.8h, v7.8b, #0                       \n"
209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 3 * line_0 + line_1
211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal     v16.8h, v0.8b, v20.8b                   \n"
212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal     v17.8h, v1.8b, v20.8b                   \n"
213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal     v18.8h, v2.8b, v20.8b                   \n"
214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal     v19.8h, v3.8b, v20.8b                   \n"
215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // (3 * line_0 + line_1) >> 2
217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v0.8b, v16.8h, #2                       \n"
218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v1.8b, v17.8h, #2                       \n"
219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v2.8b, v18.8h, #2                       \n"
220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v3.8b, v19.8h, #2                       \n"
221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // a0 = (src[0] * 3 + s[1] * 1) >> 2
223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v16.8h, v1.8b, #0                       \n"
224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal     v16.8h, v0.8b, v20.8b                   \n"
225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v0.8b, v16.8h, #2                       \n"
226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // a1 = (src[1] * 1 + s[2] * 1) >> 1
228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // a2 = (src[2] * 1 + s[3] * 3) >> 2
231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v16.8h, v2.8b, #0                       \n"
232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal     v16.8h, v3.8b, v20.8b                   \n"
233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v2.8b, v16.8h, #2                       \n"
234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt      1b                                      \n"
239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),          // %1
241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width),        // %2
242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_stride)        // %3
243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
244f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
245f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "v20", "memory", "cc"
246f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
247f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
248f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
249f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
250f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               ptrdiff_t src_stride,
251b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               uint8* dst_ptr,
252b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               int dst_width) {
253f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
254f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movi      v20.8b, #3                              \n"
255f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       %3, %3, %0                              \n"
256f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                                  \n"
257f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
258f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
259f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(3)
260f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
261f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs         %w2, %w2, #24                        \n"
262f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // average src line 0 with src line 1
263f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
264f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
265f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
266f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
267f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
268f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // a0 = (src[0] * 3 + s[1] * 1) >> 2
269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v4.8h, v1.8b, #0                        \n"
270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal     v4.8h, v0.8b, v20.8b                    \n"
271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v0.8b, v4.8h, #2                        \n"
272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // a1 = (src[1] * 1 + s[2] * 1) >> 1
274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // a2 = (src[2] * 1 + s[3] * 3) >> 2
277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v4.8h, v2.8b, #0                        \n"
278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal     v4.8h, v3.8b, v20.8b                    \n"
279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v2.8b, v4.8h, #2                        \n"
280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt      1b                                      \n"
284f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),          // %1
286f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width),        // %2
287f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_stride)        // %3
288f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
289f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
291f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
292f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
293b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
294b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
295b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          34, 6,  22, 35, 0,  0,  0, 0};
296b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
297b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                             65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
298b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
299b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                             65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 32 -> 12
302f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ScaleRowDown38_NEON(const uint8* src_ptr,
303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                         ptrdiff_t src_stride,
304b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_ptr,
305b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int dst_width) {
306b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  (void)src_stride;
307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
308f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(3)
309f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1       {v3.16b}, [%3]                          \n"
310f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                                  \n"
311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
312b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
313f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs      %w2, %w2, #12                           \n"
314b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1       {v2.8b}, [%1], #8                       \n"
317f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1       {v2.s}[2], [%1], #4                     \n"
319f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt      1b                                      \n"
320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),          // %1
322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)         // %2
323f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(&kShuf38)           // %3
324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "memory", "cc"
325f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
328f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 32x3 -> 12x1
329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                      ptrdiff_t src_stride,
331b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                      uint8* dst_ptr,
332b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                      int dst_width) {
333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint8* src_ptr1 = src_ptr + src_stride * 2;
334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ptrdiff_t tmp_src_stride = src_stride;
335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
337f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(5)
338f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1       {v29.8h}, [%5]                          \n"
339f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(6)
340f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1       {v30.16b}, [%6]                         \n"
341f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(7)
342f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1       {v31.8h}, [%7]                          \n"
343f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       %2, %2, %0                              \n"
344f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                                  \n"
345f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
346f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 00 40 01 41 02 42 03 43
347f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 10 50 11 51 12 52 13 53
348f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 20 60 21 61 22 62 23 63
349f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 30 70 31 71 32 72 33 73
350f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
351f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
352f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(3)
353f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
354f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(4)
355f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
356f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs      %w4, %w4, #12                           \n"
357f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
358f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Shuffle the input data around to get align the data
359f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
360f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 00 10 01 11 02 12 03 13
361f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 40 50 41 51 42 52 43 53
362f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v20.8b, v0.8b, v1.8b                    \n"
363f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v21.8b, v0.8b, v1.8b                    \n"
364f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v22.8b, v4.8b, v5.8b                    \n"
365f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v23.8b, v4.8b, v5.8b                    \n"
366f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v24.8b, v16.8b, v17.8b                  \n"
367f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v25.8b, v16.8b, v17.8b                  \n"
368f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
369f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 20 30 21 31 22 32 23 33
370f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 60 70 61 71 62 72 63 73
371f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v0.8b, v2.8b, v3.8b                     \n"
372f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v1.8b, v2.8b, v3.8b                     \n"
373f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v4.8b, v6.8b, v7.8b                     \n"
374f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v5.8b, v6.8b, v7.8b                     \n"
375f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v16.8b, v18.8b, v19.8b                  \n"
376f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v17.8b, v18.8b, v19.8b                  \n"
377f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
378f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 00+10 01+11 02+12 03+13
379f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 40+50 41+51 42+52 43+53
380f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v20.4h, v20.8b                          \n"
381f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v21.4h, v21.8b                          \n"
382f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v22.4h, v22.8b                          \n"
383f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v23.4h, v23.8b                          \n"
384f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v24.4h, v24.8b                          \n"
385f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v25.4h, v25.8b                          \n"
386f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
387f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 60+70 61+71 62+72 63+73
388f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v1.4h, v1.8b                            \n"
389f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v5.4h, v5.8b                            \n"
390f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v17.4h, v17.8b                          \n"
391f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
392f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // combine source lines
393f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v20.4h, v20.4h, v22.4h                  \n"
394f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v21.4h, v21.4h, v23.4h                  \n"
395f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v20.4h, v20.4h, v24.4h                  \n"
396f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v21.4h, v21.4h, v25.4h                  \n"
397f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v2.4h, v1.4h, v5.4h                     \n"
398f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v2.4h, v2.4h, v17.4h                    \n"
399f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
400f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
401f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //             + s[6 + st * 1] + s[7 + st * 1]
402f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
403f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
404f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xtn       v2.8b,  v2.8h                           \n"
405f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
406f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Shuffle 2,3 reg around so that 2 can be added to the
407f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  0,1 reg and 3 can be added to the 4,5 reg. This
408f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  requires expanding from u8 to u16 as the 0,1 and 4,5
409f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  registers are already expanded. Then do transposes
410f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  to get aligned.
411f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
412f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v16.8h, v16.8b, #0                      \n"
413f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
414f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
415f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // combine source lines
416f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v0.8h, v0.8h, v16.8h                    \n"
417f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
418f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // xx 20 xx 21 xx 22 xx 23
419f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // xx 30 xx 31 xx 32 xx 33
420f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v1.8h, v0.8h, v0.8h                     \n"
421f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v4.8h, v0.8h, v0.8h                     \n"
422f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xtn       v0.4h, v1.4s                            \n"
423f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xtn       v4.4h, v4.4s                            \n"
424f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
425f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 0+1+2, 3+4+5
426f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v20.8h, v20.8h, v0.8h                   \n"
427f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v21.8h, v21.8h, v4.8h                   \n"
428f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
429f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Need to divide, but can't downshift as the the value
430f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  isn't a power of 2. So multiply by 65536 / n
431f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  and take the upper 16 bits.
432f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
433f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
434f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
435f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Align for table lookup, vtbl requires registers to
436f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  be adjacent
437f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
438f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
439f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
440f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1       {v3.8b}, [%1], #8                       \n"
441f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
442f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1       {v3.s}[2], [%1], #4                     \n"
443f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt      1b                                      \n"
444f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),          // %0
445f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),          // %1
446f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(tmp_src_stride),   // %2
447f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_ptr1),         // %3
448f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)         // %4
449f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(&kMult38_Div6),     // %5
450f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "r"(&kShuf38_2),        // %6
451f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "r"(&kMult38_Div9)      // %7
452f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
453f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
454f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "v30", "v31", "memory", "cc"
455f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
456f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
457f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
458f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 32x2 -> 12x1
459f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
460f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               ptrdiff_t src_stride,
461b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               uint8* dst_ptr,
462b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               int dst_width) {
463f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // TODO(fbarchard): use src_stride directly for clang 3.5+.
464f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ptrdiff_t tmp_src_stride = src_stride;
465f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
466f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(4)
467f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1       {v30.8h}, [%4]                          \n"
468f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(5)
469f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1       {v31.16b}, [%5]                         \n"
470f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       %2, %2, %0                              \n"
471f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                                  \n"
472f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
473f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 00 40 01 41 02 42 03 43
474f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 10 50 11 51 12 52 13 53
475f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 20 60 21 61 22 62 23 63
476f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 30 70 31 71 32 72 33 73
477f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
478f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
479f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(3)
480f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
481f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs      %w3, %w3, #12                           \n"
482f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
483f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Shuffle the input data around to get align the data
484f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
485f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 00 10 01 11 02 12 03 13
486f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 40 50 41 51 42 52 43 53
487f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v16.8b, v0.8b, v1.8b                    \n"
488f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v17.8b, v0.8b, v1.8b                    \n"
489f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v18.8b, v4.8b, v5.8b                    \n"
490f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v19.8b, v4.8b, v5.8b                    \n"
491f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
492f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 20 30 21 31 22 32 23 33
493f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 60 70 61 71 62 72 63 73
494f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v0.8b, v2.8b, v3.8b                     \n"
495f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v1.8b, v2.8b, v3.8b                     \n"
496f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v4.8b, v6.8b, v7.8b                     \n"
497f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v5.8b, v6.8b, v7.8b                     \n"
498f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
499f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 00+10 01+11 02+12 03+13
500f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 40+50 41+51 42+52 43+53
501f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v16.4h, v16.8b                          \n"
502f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v17.4h, v17.8b                          \n"
503f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v18.4h, v18.8b                          \n"
504f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v19.4h, v19.8b                          \n"
505f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
506f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 60+70 61+71 62+72 63+73
507f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v1.4h, v1.8b                            \n"
508f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp    v5.4h, v5.8b                            \n"
509f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
510f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // combine source lines
511f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v16.4h, v16.4h, v18.4h                  \n"
512f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v17.4h, v17.4h, v19.4h                  \n"
513f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v2.4h, v1.4h, v5.4h                     \n"
514f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
515f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
516f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uqrshrn   v2.8b, v2.8h, #2                        \n"
517f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
518f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Shuffle 2,3 reg around so that 2 can be added to the
519f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  0,1 reg and 3 can be added to the 4,5 reg. This
520f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  requires expanding from u8 to u16 as the 0,1 and 4,5
521f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  registers are already expanded. Then do transposes
522f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  to get aligned.
523f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
524f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
525f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // combine source lines
526f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
527f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
528f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // xx 20 xx 21 xx 22 xx 23
529f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // xx 30 xx 31 xx 32 xx 33
530f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn1      v1.8h, v0.8h, v0.8h                     \n"
531f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "trn2      v4.8h, v0.8h, v0.8h                     \n"
532f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xtn       v0.4h, v1.4s                            \n"
533f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xtn       v4.4h, v4.4s                            \n"
534f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
535f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 0+1+2, 3+4+5
536f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v16.8h, v16.8h, v0.8h                   \n"
537f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v17.8h, v17.8h, v4.8h                   \n"
538f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
539f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Need to divide, but can't downshift as the the value
540f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  isn't a power of 2. So multiply by 65536 / n
541f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  and take the upper 16 bits.
542f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
543f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
544f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
545f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Align for table lookup, vtbl requires registers to
546f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //  be adjacent
547f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
548f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
549f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
550f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
551f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1       {v3.8b}, [%1], #8                       \n"
552f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
553f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1       {v3.s}[2], [%1], #4                     \n"
554f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt      1b                                      \n"
555f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_ptr),         // %0
556f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),         // %1
557f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(tmp_src_stride),  // %2
558f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)        // %3
559f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(&kMult38_Div6),    // %4
560f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "r"(&kShuf38_2)        // %5
561f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
562f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "v18", "v19", "v30", "v31", "memory", "cc"
563f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
564f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
565f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
566b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleAddRows_NEON(const uint8* src_ptr,
567b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       ptrdiff_t src_stride,
568b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint16* dst_ptr,
569b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int src_width,
570b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int src_height) {
571f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint8* src_tmp;
572f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
573f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
574f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %0, %1                          \n"
575f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       w12, %w5                        \n"
576f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "eor       v2.16b, v2.16b, v2.16b          \n"
577f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "eor       v3.16b, v3.16b, v3.16b          \n"
578f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "2:                                          \n"
579f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // load 16 pixels into q0
580f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
581f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1       {v0.16b}, [%0], %3              \n"
582f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
583f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddw     v2.8h, v2.8h, v0.8b             \n"
584f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs      w12, w12, #1                    \n"
585f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt      2b                              \n"
586f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(2)
587f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
588f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add      %1, %1, #16                      \n"
589f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
590f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt     1b                               \n"
591f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "=&r"(src_tmp),    // %0
592f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_ptr),     // %1
593f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_ptr),     // %2
594f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_stride),  // %3
595f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_width),   // %4
596f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_height)   // %5
597f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
598f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
599f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
600f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
601f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
//
// Assembly fragment that fetches the two source bytes for output lane n.
// Operand numbers refer to the asm statement that expands the macro:
//   %1  source base pointer (read only)
//   %3  current x position; its value shifted right by 16 is used as the
//       byte offset, then x is advanced by %4
//   %4  x step (dx)
//   %5  scratch: receives x >> 16
//   %6  scratch: receives the load address %1 + (x >> 16)
// The ld2 places the byte at that address in lane n of v4 and the following
// byte in lane n of v5.
#define LOAD2_DATA8_LANE(n)                                 \
  "lsr        %5, %3, #16                    \n"            \
  "add        %6, %1, %5                     \n"            \
  "add        %3, %3, %4                     \n"            \
  MEMACCESS(6)                                              \
  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
// clang-format on
612b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
613b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard// The NEON version mimics this formula (from row_common.cc):
614b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard// #define BLENDER(a, b, f) (uint8)((int)(a) +
615b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
616b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
617b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleFilterCols_NEON(uint8* dst_ptr,
618b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          const uint8* src_ptr,
619b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int dst_width,
620b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int x,
621b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int dx) {
622f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  int dx_offset[4] = {0, 1, 2, 3};
623f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  int* tmp = dx_offset;
624f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint8* src_tmp = src_ptr;
625b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
626b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 x64 = (int64)x;
627b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 dx64 = (int64)dx;
628f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
629f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup        v0.4s, %w3                     \n"  // x
630f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup        v1.4s, %w4                     \n"  // dx
631f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
632f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
633f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mul        v1.4s, v1.4s, v2.4s            \n"
634f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
635f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        v1.4s, v1.4s, v0.4s            \n"
636f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
637f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        v2.4s, v1.4s, v3.4s            \n"
638f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
639f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
640f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA8_LANE(0)
641f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA8_LANE(1)
642f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA8_LANE(2)
643f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA8_LANE(3)
644f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA8_LANE(4)
645f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA8_LANE(5)
646f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA8_LANE(6)
647f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA8_LANE(7)
648f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       v6.16b, v1.16b                  \n"
649f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       v7.16b, v2.16b                  \n"
650f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uzp1      v6.8h, v6.8h, v7.8h             \n"
651f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v4.8h, v4.8b, #0                \n"
652f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v5.8h, v5.8b, #0                \n"
653f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ssubl     v16.4s, v5.4h, v4.4h            \n"
654f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
655f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll     v7.4s, v6.4h, #0                \n"
656f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ushll2    v6.4s, v6.8h, #0                \n"
657f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mul       v16.4s, v16.4s, v7.4s           \n"
658f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mul       v17.4s, v17.4s, v6.4s           \n"
659b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "rshrn     v6.4h, v16.4s, #16              \n"
660b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "rshrn2    v6.8h, v17.4s, #16              \n"
661f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v4.8h, v4.8h, v6.8h             \n"
662f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xtn       v4.8b, v4.8h                    \n"
663f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
664f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
665f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
666f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v1.4s, v1.4s, v0.4s             \n"
667f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       v2.4s, v2.4s, v0.4s             \n"
668f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
669f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt      1b                              \n"
670f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(dst_ptr),          // %0
671f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_ptr),          // %1
672f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width64),      // %2
673f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(x64),              // %3
674f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dx64),             // %4
675f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(tmp),              // %5
676f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_tmp)           // %6
677f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
678f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "v0", "v1", "v2", "v3",
679f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "v4", "v5", "v6", "v7", "v16", "v17"
680f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
681f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
682f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
683f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef LOAD2_DATA8_LANE
684f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
685f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 16x2 -> 16x1
686f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ScaleFilterRows_NEON(uint8* dst_ptr,
687b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          const uint8* src_ptr,
688b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          ptrdiff_t src_stride,
689b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int dst_width,
690b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int source_y_fraction) {
691b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int y_fraction = 256 - source_y_fraction;
692f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
693f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmp          %w4, #0                      \n"
694f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.eq         100f                         \n"
695f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add          %2, %2, %1                   \n"
696f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmp          %w4, #64                     \n"
697f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.eq         75f                          \n"
698f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmp          %w4, #128                    \n"
699f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.eq         50f                          \n"
700f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmp          %w4, #192                    \n"
701f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.eq         25f                          \n"
702f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
703f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup          v5.8b, %w4                   \n"
704f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup          v4.8b, %w5                   \n"
705f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // General purpose row blend.
706f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
707f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
708f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v0.16b}, [%1], #16          \n"
709f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(2)
710f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v1.16b}, [%2], #16          \n"
711f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs         %w3, %w3, #16                \n"
712f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umull        v6.8h, v0.8b, v4.8b          \n"
713f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umull2       v7.8h, v0.16b, v4.16b        \n"
714f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal        v6.8h, v1.8b, v5.8b          \n"
715f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umlal2       v7.8h, v1.16b, v5.16b        \n"
716f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn        v0.8b, v6.8h, #8             \n"
717f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn2       v0.16b, v7.8h, #8            \n"
718f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
719f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1          {v0.16b}, [%0], #16          \n"
720f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt         1b                           \n"
721f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b            99f                          \n"
722f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
723f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Blend 25 / 75.
724f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "25:                                         \n"
725f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
726f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v0.16b}, [%1], #16          \n"
727f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(2)
728f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v1.16b}, [%2], #16          \n"
729f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs         %w3, %w3, #16                \n"
730f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd       v0.16b, v0.16b, v1.16b       \n"
731f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd       v0.16b, v0.16b, v1.16b       \n"
732f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
733f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1          {v0.16b}, [%0], #16          \n"
734f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt         25b                          \n"
735f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b            99f                          \n"
736f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
737f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Blend 50 / 50.
738f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "50:                                         \n"
739f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
740f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v0.16b}, [%1], #16          \n"
741f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(2)
742f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v1.16b}, [%2], #16          \n"
743f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs         %w3, %w3, #16                \n"
744f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd       v0.16b, v0.16b, v1.16b       \n"
745f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
746f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1          {v0.16b}, [%0], #16          \n"
747f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt         50b                          \n"
748f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b            99f                          \n"
749f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
750f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Blend 75 / 25.
751f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "75:                                         \n"
752f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
753f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v1.16b}, [%1], #16          \n"
754f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(2)
755f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v0.16b}, [%2], #16          \n"
756f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs         %w3, %w3, #16                \n"
757f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd       v0.16b, v0.16b, v1.16b       \n"
758f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "urhadd       v0.16b, v0.16b, v1.16b       \n"
759f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
760f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1          {v0.16b}, [%0], #16          \n"
761f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt         75b                          \n"
762f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b            99f                          \n"
763f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
764f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Blend 100 / 0 - Copy row unchanged.
765f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "100:                                        \n"
766f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
767f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1          {v0.16b}, [%1], #16          \n"
768f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs         %w3, %w3, #16                \n"
769f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
770f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1          {v0.16b}, [%0], #16          \n"
771f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt         100b                         \n"
772f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
773f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "99:                                         \n"
774f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
775f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1          {v0.b}[15], [%0]             \n"
776f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(dst_ptr),          // %0
777f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_ptr),          // %1
778f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_stride),       // %2
779f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width),        // %3
780f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(source_y_fraction),// %4
781f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(y_fraction)        // %5
782f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
783f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
784f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
785f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
786f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
787b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleARGBRowDown2_NEON(const uint8* src_ptr,
788b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                            ptrdiff_t src_stride,
789b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                            uint8* dst,
790b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                            int dst_width) {
791b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  (void)src_stride;
792f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
793f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
794f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // load even pixels into q0, odd into q1
795f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (0)
796f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
797f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (0)
798f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
799f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
800f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (1)
801f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
802f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (1)
803f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1        {v3.16b}, [%1], #16            \n"
804f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
805f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r" (src_ptr),          // %0
806f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r" (dst),              // %1
807f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r" (dst_width)         // %2
808f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
809f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
810f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
811f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
812f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
813b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
814b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  ptrdiff_t src_stride,
815b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  uint8* dst_argb,
816b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  int dst_width) {
817b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  (void)src_stride;
818f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
819f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
820f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (0)
821f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // load 8 ARGB pixels.
822f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
823f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
824f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
825f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
826f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
827f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
828f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
829f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v1.8b, v1.8h, #1               \n"
830f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v2.8b, v2.8h, #1               \n"
831f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v3.8b, v3.8h, #1               \n"
832f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (1)
833f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
834f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
835f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),         // %0
836f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),         // %1
837f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)         // %2
838f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
839f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
840f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
841f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
842f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
843b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
844b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               ptrdiff_t src_stride,
845b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               uint8* dst,
846b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               int dst_width) {
847f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
848f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // change the stride to row 2 pointer
849f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        %1, %1, %0                     \n"
850f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
851f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (0)
852f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
853f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
854f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
855f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
856f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
857f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
858f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (1)
859f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
860f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
861f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
862f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
863f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
864f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
865f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v1.8b, v1.8h, #2               \n"
866f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v2.8b, v2.8h, #2               \n"
867f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v3.8b, v3.8h, #2               \n"
868f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS (2)
869f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
870f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
871f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r" (src_ptr),          // %0
872f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r" (src_stride),       // %1
873f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r" (dst),              // %2
874f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r" (dst_width)         // %3
875f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
876f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
877f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
878f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
879f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
880f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Reads 4 pixels at a time.
881f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Alignment requirement: src_argb 4 byte aligned.
882b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleARGBRowDownEven_NEON(const uint8* src_argb,
883b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               ptrdiff_t src_stride,
884b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               int src_stepx,
885b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               uint8* dst_argb,
886b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                               int dst_width) {
887b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  (void)src_stride;
888f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
889f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
890f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
891f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v0.s}[0], [%0], %3            \n"
892f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
893f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v0.s}[1], [%0], %3            \n"
894f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
895f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v0.s}[2], [%0], %3            \n"
896f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
897f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v0.s}[3], [%0], %3            \n"
898f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
899f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
900f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1        {v0.16b}, [%1], #16            \n"
901f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
902f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),    // %0
903f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),    // %1
904f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)    // %2
905f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((int64)(src_stepx * 4)) // %3
906f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "v0"
907f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
908f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
909f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
910f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Reads 4 pixels at a time.
911f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Alignment requirement: src_argb 4 byte aligned.
912f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// TODO(Yang Zhang): Might be worth another optimization pass in future.
913f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// It could be upgraded to 8 pixels at a time to start with.
914b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
915b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  ptrdiff_t src_stride,
916f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                  int src_stepx,
917b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  uint8* dst_argb,
918b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  int dst_width) {
919f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
920f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        %1, %1, %0                     \n"
921f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
922f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
923f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
924f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
925f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v1.8b}, [%1], %4              \n"
926f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
927f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v2.8b}, [%0], %4              \n"
928f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
929f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v3.8b}, [%1], %4              \n"
930f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
931f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v4.8b}, [%0], %4              \n"
932f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
933f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v5.8b}, [%1], %4              \n"
934f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
935f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v6.8b}, [%0], %4              \n"
936f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(1)
937f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v7.8b}, [%1], %4              \n"
938f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddl      v0.8h, v0.8b, v1.8b            \n"
939f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddl      v2.8h, v2.8b, v3.8b            \n"
940f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddl      v4.8h, v4.8b, v5.8b            \n"
941f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "uaddl      v6.8h, v6.8b, v7.8b            \n"
942f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
943f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov        v0.d[1], v2.d[0]               \n"
944f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov        v2.d[0], v16.d[1]              \n"
945f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
946f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov        v4.d[1], v6.d[0]               \n"
947f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov        v6.d[0], v16.d[1]              \n"
948f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
949f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
950f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
951f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
952f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
953f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(2)
954f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1     {v0.16b}, [%2], #16               \n"
955f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt       1b                             \n"
956f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),    // %0
957f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_stride),  // %1
958f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),    // %2
959f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width)    // %3
960f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((int64)(src_stepx * 4)) // %4
961f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
962f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
963f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
964f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
965b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard// clang-format off
966f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// TODO(Yang Zhang): Investigate less load instructions for
967f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// the x/dx stepping
968b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define LOAD1_DATA32_LANE(vn, n)                            \
969b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "lsr        %5, %3, #16                    \n"            \
970b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "add        %6, %1, %5, lsl #2             \n"            \
971b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "add        %3, %3, %4                     \n"            \
972b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  MEMACCESS(6)                                              \
973b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard "ld1        {" #vn ".s}[" #n "], [%6]       \n"
974b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard// clang-format on
975b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
976b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleARGBCols_NEON(uint8* dst_argb,
977b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        const uint8* src_argb,
978b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int dst_width,
979b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int x,
980b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int dx) {
981f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint8* src_tmp = src_argb;
982b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
983b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 x64 = (int64)x;
984b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 dx64 = (int64)dx;
985f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  int64 tmp64;
986f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
987f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
988f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD1_DATA32_LANE(v0, 0)
989f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD1_DATA32_LANE(v0, 1)
990f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD1_DATA32_LANE(v0, 2)
991f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD1_DATA32_LANE(v0, 3)
992f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD1_DATA32_LANE(v1, 0)
993f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD1_DATA32_LANE(v1, 1)
994f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD1_DATA32_LANE(v1, 2)
995f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD1_DATA32_LANE(v1, 3)
996f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
997f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
998f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
999f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
1000f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt        1b                            \n"
1001f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(dst_argb),     // %0
1002f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_argb),     // %1
1003f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width64),  // %2
1004f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(x64),          // %3
1005f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dx64),         // %4
1006f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "=&r"(tmp64),       // %5
1007f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_tmp)       // %6
1008f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
1009f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "v0", "v1"
1010f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1011f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1012f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1013f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef LOAD1_DATA32_LANE
1014f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1015b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard// clang-format off
1016f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// TODO(Yang Zhang): Investigate fewer load instructions for
1017f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// the x/dx stepping
//
// LOAD2_DATA32_LANE(vn1, vn2, n) expands to an assembly fragment that loads
// two consecutive 32-bit words from the source row into lane n of the vector
// registers vn1 and vn2, then steps the source position. In order:
//   %5 = %3 >> 16          logical shift of the position; %5 is used as scratch
//   %6 = %1 + (%5 << 2)    address of the 32-bit word at that index
//   %3 = %3 + %4           advance the position by the step
//   ld2 {vn1.s, vn2.s}[n]  word at [%6] -> vn1 lane n, following word -> vn2
//                          lane n
// The operand numbers are those of the asm statement the macro is pasted
// into. In ScaleARGBFilterCols_NEON they are %1 = src_argb, %3 = x64,
// %4 = dx64, %5 = tmp and %6 = src_tmp.
// MEMACCESS is defined outside this section of the file.
1018b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define LOAD2_DATA32_LANE(vn1, vn2, n)                             \
1019b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "lsr        %5, %3, #16                           \n"            \
1020b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "add        %6, %1, %5, lsl #2                    \n"            \
1021b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "add        %3, %3, %4                            \n"            \
1022b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  MEMACCESS(6)                                                     \
1023b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
1024b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard// clang-format on
1025b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
// Scales one row of 32-bit pixels horizontally, blending each pair of adjacent
// source pixels with a 7-bit weight (AArch64 NEON inline assembly).
//
// For every output pixel, with i = x >> 16 and f = (x >> 9) & 0x7f, each of
// the four bytes is computed as
//   (src[i] * (0x7f ^ f) + src[i + 1] * f) >> 7
// and x is then advanced by dx. Because f <= 0x7f, (0x7f ^ f) equals 0x7f - f.
//
//   dst_argb   Output row. 16 bytes (4 pixels) are stored per loop iteration.
//   src_argb   Source row. 8 bytes (2 pixels) are read at
//              src_argb + 4 * (x >> 16) for every output pixel.
//   dst_width  Number of output pixels. The loop subtracts 4 per iteration and
//              repeats while the result is > 0, so the number of pixels written
//              is dst_width rounded up to a multiple of 4, and at least 4.
//   x          Starting source position. Bits 16 and up select the source
//              pixel; bits 9..15 are the blend weight.
//   dx         Amount added to x per output pixel.
//
// NOTE(review): the index is formed with a logical shift (lsr) of the 64-bit
// position, so this presumably expects x to stay non-negative - confirm
// against callers. Callers presumably also guarantee that the rounded-up
// stores and the src[i + 1] reads stay inside their buffers - TODO confirm.
1026b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ScaleARGBFilterCols_NEON(uint8* dst_argb,
1027b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              const uint8* src_argb,
1028b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              int dst_width,
1029b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              int x,
1030b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              int dx) {
  // Per-lane multipliers for dx; read once into v2 through tmp (%5).
1031f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  int dx_offset[4] = {0, 1, 2, 3};
  // After the initial ld1, %5 is reused as the scratch index register by
  // LOAD2_DATA32_LANE.
1032f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  int* tmp = dx_offset;
  // %6: overwritten with the computed load address for every pixel.
1033f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint8* src_tmp = src_argb;
1034b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
1035b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 x64 = (int64)x;
1036b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  int64 dx64 = (int64)dx;
1037f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // Setup: build the vector of four positions (v5) and its per-iteration
    // step (v6). The %w forms use the low 32 bits of x64 / dx64.
1038f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup        v0.4s, %w3                     \n"  // x
1039f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup        v1.4s, %w4                     \n"  // dx
1040f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
1041f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
    // v1 = 0, dx, 2 * dx, 3 * dx
1042f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mul        v1.4s, v1.4s, v2.4s            \n"
    // v3: 0x7f in every byte (xor mask); v4: 0x7f in every halfword (and mask)
1043f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movi       v3.16b, #0x7f                  \n"  // 0x7F
1044f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movi       v4.8h, #0x7f                   \n"  // 0x7F
1045f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
1046f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        v5.4s, v1.4s, v0.4s            \n"
    // Main loop: 4 output pixels per iteration.
1047f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "1:                                          \n"
    // Gather 4 pixel pairs; each macro use also advances the scalar x (%3).
1048f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // v0: a (pixel at x >> 16), one pixel per 32-bit lane
1049f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // v1: b (the pixel that follows a), one pixel per 32-bit lane
1050f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA32_LANE(v0, v1, 0)
1051f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA32_LANE(v0, v1, 1)
1052f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA32_LANE(v0, v1, 2)
1053f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LOAD2_DATA32_LANE(v0, v1, 3)
    // f = (x >> 9) & 0x7f for each of the 4 positions, one per halfword of v2.
1054f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shrn       v2.4h, v5.4s, #9               \n"
1055f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "and        v2.8b, v2.8b, v4.8b            \n"
    // Broadcast each pixel's f across a register, then splice them so v2
    // holds f0 x4, f1 x4, f2 x4, f3 x4: one weight per byte of each pixel.
1056f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup        v16.8b, v2.b[0]                \n"
1057f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup        v17.8b, v2.b[2]                \n"
1058f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup        v18.8b, v2.b[4]                \n"
1059f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dup        v19.8b, v2.b[6]                \n"
1060f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
1061f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
1062f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ins        v2.d[1], v17.d[0]              \n"  // f
1063f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    // Widening multiplies: v16/v17 = a * (0x7f ^ f), v18/v19 = b * f
    // (low and high 8 bytes respectively), summed as 16-bit values.
1064f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umull      v16.8h, v0.8b, v7.8b           \n"
1065f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umull2     v17.8h, v0.16b, v7.16b         \n"
1066f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umull      v18.8h, v1.8b, v2.8b           \n"
1067f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "umull2     v19.8h, v1.16b, v2.16b         \n"
1068f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        v16.8h, v16.8h, v18.8h         \n"
1069f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add        v17.8h, v17.8h, v19.8h         \n"
    // Shift right by 7 and narrow back to bytes: 4 blended pixels in v0.
1070f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shrn       v0.8b, v16.8h, #7              \n"
1071f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shrn2      v0.16b, v17.8h, #7             \n"
1072f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1073f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMACCESS(0)
1074f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
    // Advance all four vector positions by 4 * dx for the next iteration.
1075f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add     v5.4s, v5.4s, v6.4s               \n"
1076f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
1077f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "b.gt    1b                                \n"
    // Every operand is read-write ("+r"): the assembly modifies all of them.
1078f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(dst_argb),         // %0
1079f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_argb),         // %1
1080f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_width64),      // %2
1081f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(x64),              // %3
1082f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dx64),             // %4
1083f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(tmp),              // %5
1084f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_tmp)           // %6
1085f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
1086f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
1087f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "v6", "v7", "v16", "v17", "v18", "v19"
1088f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1089f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1090f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1091f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef LOAD2_DATA32_LANE
1092f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1093f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
1094f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1095f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
1096f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}  // extern "C"
1097f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}  // namespace libyuv
1098f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
1099