1/*
2 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/scale.h"
12#include "libyuv/row.h"
13#include "libyuv/scale_row.h"
14
15#ifdef __cplusplus
16namespace libyuv {
17extern "C" {
18#endif
19
20// This module is for GCC Neon armv8 64 bit.
21#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
22
23// Read 32x1 throw away even pixels, and write 16x1.
24void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
25                        uint8* dst, int dst_width) {
26  asm volatile (
27  "1:                                          \n"
28    // load even pixels into v0, odd into v1
29    MEMACCESS(0)
30    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
31    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
32    MEMACCESS(1)
33    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
34    "b.gt       1b                             \n"
35  : "+r"(src_ptr),          // %0
36    "+r"(dst),              // %1
37    "+r"(dst_width)         // %2
38  :
39  : "v0", "v1"              // Clobber List
40  );
41}
42
43// Read 32x1 average down and write 16x1.
44void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
45                           uint8* dst, int dst_width) {
46  asm volatile (
47  "1:                                          \n"
48    MEMACCESS(0)
49    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
50    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
51    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
52    "uaddlp     v1.8h, v1.16b                  \n"
53    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
54    "rshrn2     v0.16b, v1.8h, #1              \n"
55    MEMACCESS(1)
56    "st1        {v0.16b}, [%1], #16            \n"
57    "b.gt       1b                             \n"
58  : "+r"(src_ptr),          // %0
59    "+r"(dst),              // %1
60    "+r"(dst_width)         // %2
61  :
62  : "v0", "v1"     // Clobber List
63  );
64}
65
66// Read 32x2 average down and write 16x1.
67void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
68                           uint8* dst, int dst_width) {
69  asm volatile (
70    // change the stride to row 2 pointer
71    "add        %1, %1, %0                     \n"
72  "1:                                          \n"
73    MEMACCESS(0)
74    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
75    MEMACCESS(1)
76    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
77    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
78    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
79    "uaddlp     v1.8h, v1.16b                  \n"
80    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
81    "uadalp     v1.8h, v3.16b                  \n"
82    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
83    "rshrn2     v0.16b, v1.8h, #2              \n"
84    MEMACCESS(2)
85    "st1        {v0.16b}, [%2], #16            \n"
86    "b.gt       1b                             \n"
87  : "+r"(src_ptr),          // %0
88    "+r"(src_stride),       // %1
89    "+r"(dst),              // %2
90    "+r"(dst_width)         // %3
91  :
92  : "v0", "v1", "v2", "v3"     // Clobber List
93  );
94}
95
// Read 32x1, point-sample every 4th pixel, and write 8x1.
// src_stride is unused (single-row point sample).
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    // ld4 deinterleaves: v2 holds src[2], src[6], src[10], ...
    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    MEMACCESS(1)
    "st1     {v2.8b}, [%1], #8                 \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
113
// Read 16x4, average each 4x4 box, and write 4x1.
// Four row pointers are computed up front so the asm can walk all rows
// in lockstep.
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
    // NOTE(review): MEMACCESS indices (3,4,5) don't match the operand
    // numbers used below (%2,%3,%4) — confirm against MEMACCESS's
    // definition in row.h whether the argument matters.
    MEMACCESS(3)
    "ld1     {v1.16b}, [%2], #16               \n"
    MEMACCESS(4)
    "ld1     {v2.16b}, [%3], #16               \n"
    MEMACCESS(5)
    "ld1     {v3.16b}, [%4], #16               \n"
    "subs    %w5, %w5, #4                      \n"
    // Sum vertically: pairwise add row0, then accumulate rows 1-3.
    "uaddlp  v0.8h, v0.16b                     \n"
    "uadalp  v0.8h, v1.16b                     \n"
    "uadalp  v0.8h, v2.16b                     \n"
    "uadalp  v0.8h, v3.16b                     \n"
    // Sum horizontally: each halfword now holds a 4x4 box total (16 pixels).
    "addp    v0.8h, v0.8h, v0.8h               \n"
    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
    MEMACCESS(1)
    "st1    {v0.s}[0], [%1], #4                \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_ptr1),  // %2
    "+r"(src_ptr2),  // %3
    "+r"(src_ptr3),  // %4
    "+r"(dst_width)  // %5
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
149
150// Down scale from 4 to 3 pixels. Use the neon multilane read/write
151// to load up the every 4th pixel into a 4 different registers.
152// Point samples 32 pixels to 24 pixels.
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels.
// src_stride is unused (single-row point sample).
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
    "subs      %w2, %w2, #24                           \n"
    // orr with itself = register move: keep pixels 0,1,3 of every 4
    // (drop v2) so the st3 below writes v0,v1,v3 interleaved.
    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
172
// Down scale 32 -> 24 pixels with a vertical 3:1 blend (3*row0 + row1)
// followed by a 4->3 horizontal filter.
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movi      v20.8b, #3                              \n"  // vertical weight
    "add       %3, %3, %0                              \n"  // %3 = row 1 ptr
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
    "subs         %w2, %w2, #24                        \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "ushll     v16.8h, v4.8b, #0                       \n"
    "ushll     v17.8h, v5.8b, #0                       \n"
    "ushll     v18.8h, v6.8b, #0                       \n"
    "ushll     v19.8h, v7.8b, #0                       \n"

    // 3 * line_0 + line_1
    "umlal     v16.8h, v0.8b, v20.8b                   \n"
    "umlal     v17.8h, v1.8b, v20.8b                   \n"
    "umlal     v18.8h, v2.8b, v20.8b                   \n"
    "umlal     v19.8h, v3.8b, v20.8b                   \n"

    // (3 * line_0 + line_1) >> 2
    "uqrshrn   v0.8b, v16.8h, #2                       \n"
    "uqrshrn   v1.8b, v17.8h, #2                       \n"
    "uqrshrn   v2.8b, v18.8h, #2                       \n"
    "uqrshrn   v3.8b, v19.8h, #2                       \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "ushll     v16.8h, v1.8b, #0                       \n"
    "umlal     v16.8h, v0.8b, v20.8b                   \n"
    "uqrshrn   v0.8b, v16.8h, #2                       \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "urhadd    v1.8b, v1.8b, v2.8b                     \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "ushll     v16.8h, v2.8b, #0                       \n"
    "umlal     v16.8h, v3.8b, v20.8b                   \n"
    "uqrshrn   v2.8b, v16.8h, #2                       \n"

    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"

    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
    "v20", "memory", "cc"
  );
}
232
// Down scale 32 -> 24 pixels with a vertical 1:1 average of two rows
// followed by the same 4->3 horizontal filter as the _0_Box variant.
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movi      v20.8b, #3                              \n"  // horizontal weight
    "add       %3, %3, %0                              \n"  // %3 = row 1 ptr
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
    "subs         %w2, %w2, #24                        \n"
    // average src line 0 with src line 1
    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
    "urhadd    v3.8b, v3.8b, v7.8b                     \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "ushll     v4.8h, v1.8b, #0                        \n"
    "umlal     v4.8h, v0.8b, v20.8b                    \n"
    "uqrshrn   v0.8b, v4.8h, #2                        \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "urhadd    v1.8b, v1.8b, v2.8b                     \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "ushll     v4.8h, v2.8b, #0                        \n"
    "umlal     v4.8h, v3.8b, v20.8b                    \n"
    "uqrshrn   v2.8b, v4.8h, #2                        \n"

    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
  );
}
275
// tbl shuffle selecting 12 of 32 point-sampled pixels for ScaleRowDown38.
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
// tbl shuffle gathering the 12 filtered results spread across three
// registers (byte offsets 0/16/32) for the 38_*_Box kernels.
static uvec8 kShuf38_2 =
  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
// Divide-by-6 multiplier for sqrdmulh: sqrdmulh doubles the product, so
// 65536/12 yields x/6 in the upper 16 bits.
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
// Divide-by-9 multiplier for sqrdmulh (65536/18 doubled by sqrdmulh = x/9).
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
286
287// 32 -> 12
// 32 -> 12
// Point-samples via the kShuf38 table lookup; src_stride is unused.
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "ld1       {v3.16b}, [%3]                          \n"  // v3 = kShuf38
  "1:                                                  \n"
    MEMACCESS(0)
    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"
    "subs      %w2, %w2, #12                           \n"
    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"
    MEMACCESS(1)
    // 12 result bytes are stored as 8 + 4.
    "st1       {v2.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v2.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
311
312// 32x3 -> 12x1
// 32x3 -> 12x1
// Box filter over 3 rows: most outputs average 8/3 columns x 3 rows
// (divide by 9), the remainder averages 2 columns x 3 rows (divide by 6).
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;
  // Local copy so the asm can turn it into the row-1 pointer.
  ptrdiff_t tmp_src_stride = src_stride;

  asm volatile (
    MEMACCESS(5)
    "ld1       {v29.8h}, [%5]                          \n"  // kMult38_Div6
    MEMACCESS(6)
    "ld1       {v30.16b}, [%6]                         \n"  // kShuf38_2
    MEMACCESS(7)
    "ld1       {v31.8h}, [%7]                          \n"  // kMult38_Div9
    "add       %2, %2, %0                              \n"  // %2 = row 1 ptr
  "1:                                                  \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
    // NOTE(review): MEMACCESS indices (3,4) don't match the operands used
    // (%2,%3) — verify against MEMACCESS's definition in row.h.
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
    MEMACCESS(4)
    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
    "subs      %w4, %w4, #12                           \n"

    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1      v20.8b, v0.8b, v1.8b                    \n"
    "trn2      v21.8b, v0.8b, v1.8b                    \n"
    "trn1      v22.8b, v4.8b, v5.8b                    \n"
    "trn2      v23.8b, v4.8b, v5.8b                    \n"
    "trn1      v24.8b, v16.8b, v17.8b                  \n"
    "trn2      v25.8b, v16.8b, v17.8b                  \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1      v0.8b, v2.8b, v3.8b                     \n"
    "trn2      v1.8b, v2.8b, v3.8b                     \n"
    "trn1      v4.8b, v6.8b, v7.8b                     \n"
    "trn2      v5.8b, v6.8b, v7.8b                     \n"
    "trn1      v16.8b, v18.8b, v19.8b                  \n"
    "trn2      v17.8b, v18.8b, v19.8b                  \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp    v20.4h, v20.8b                          \n"
    "uaddlp    v21.4h, v21.8b                          \n"
    "uaddlp    v22.4h, v22.8b                          \n"
    "uaddlp    v23.4h, v23.8b                          \n"
    "uaddlp    v24.4h, v24.8b                          \n"
    "uaddlp    v25.4h, v25.8b                          \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp    v1.4h, v1.8b                            \n"
    "uaddlp    v5.4h, v5.8b                            \n"
    "uaddlp    v17.4h, v17.8b                          \n"

    // combine source lines
    "add       v20.4h, v20.4h, v22.4h                  \n"
    "add       v21.4h, v21.4h, v23.4h                  \n"
    "add       v20.4h, v20.4h, v24.4h                  \n"
    "add       v21.4h, v21.4h, v25.4h                  \n"
    "add       v2.4h, v1.4h, v5.4h                     \n"
    "add       v2.4h, v2.4h, v17.4h                    \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
    "xtn       v2.8b,  v2.8h                           \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "ushll     v16.8h, v16.8b, #0                      \n"
    "uaddl     v0.8h, v0.8b, v4.8b                     \n"

    // combine source lines
    "add       v0.8h, v0.8h, v16.8h                    \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1      v1.8h, v0.8h, v0.8h                     \n"
    "trn2      v4.8h, v0.8h, v0.8h                     \n"
    "xtn       v0.4h, v1.4s                            \n"
    "xtn       v4.4h, v4.4s                            \n"

    // 0+1+2, 3+4+5
    "add       v20.8h, v20.8h, v0.8h                   \n"
    "add       v21.8h, v21.8h, v4.8h                   \n"

    // Need to divide, but can't downshift as the the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

    MEMACCESS(1)
    "st1       {v3.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v3.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(tmp_src_stride),   // %2
    "+r"(src_ptr1),         // %3
    "+r"(dst_width)         // %4
  : "r"(&kMult38_Div6),     // %5
    "r"(&kShuf38_2),        // %6
    "r"(&kMult38_Div9)      // %7
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
    "v30", "v31", "memory", "cc"
  );
}
440
441// 32x2 -> 12x1
// 32x2 -> 12x1
// Box filter over 2 rows: most outputs average 8/3 columns x 2 rows
// (divide by 6), the remainder averages 2 columns x 2 rows (shift by 2).
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile (
    MEMACCESS(4)
    "ld1       {v30.8h}, [%4]                          \n"  // kMult38_Div6
    MEMACCESS(5)
    "ld1       {v31.16b}, [%5]                         \n"  // kShuf38_2
    "add       %2, %2, %0                              \n"  // %2 = row 1 ptr
  "1:                                                  \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
    // NOTE(review): MEMACCESS index (3) doesn't match the operand used
    // (%2) — verify against MEMACCESS's definition in row.h.
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
    "subs      %w3, %w3, #12                           \n"

    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1      v16.8b, v0.8b, v1.8b                    \n"
    "trn2      v17.8b, v0.8b, v1.8b                    \n"
    "trn1      v18.8b, v4.8b, v5.8b                    \n"
    "trn2      v19.8b, v4.8b, v5.8b                    \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1      v0.8b, v2.8b, v3.8b                     \n"
    "trn2      v1.8b, v2.8b, v3.8b                     \n"
    "trn1      v4.8b, v6.8b, v7.8b                     \n"
    "trn2      v5.8b, v6.8b, v7.8b                     \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp    v16.4h, v16.8b                          \n"
    "uaddlp    v17.4h, v17.8b                          \n"
    "uaddlp    v18.4h, v18.8b                          \n"
    "uaddlp    v19.4h, v19.8b                          \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp    v1.4h, v1.8b                            \n"
    "uaddlp    v5.4h, v5.8b                            \n"

    // combine source lines
    "add       v16.4h, v16.4h, v18.4h                  \n"
    "add       v17.4h, v17.4h, v19.4h                  \n"
    "add       v2.4h, v1.4h, v5.4h                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "uqrshrn   v2.8b, v2.8h, #2                        \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

    // combine source lines
    "uaddl     v0.8h, v0.8b, v4.8b                     \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1      v1.8h, v0.8h, v0.8h                     \n"
    "trn2      v4.8h, v0.8h, v0.8h                     \n"
    "xtn       v0.4h, v1.4s                            \n"
    "xtn       v4.4h, v4.4s                            \n"

    // 0+1+2, 3+4+5
    "add       v16.8h, v16.8h, v0.8h                   \n"
    "add       v17.8h, v17.8h, v4.8h                   \n"

    // Need to divide, but can't downshift as the the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent

    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

    MEMACCESS(1)
    "st1       {v3.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v3.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),         // %0
    "+r"(dst_ptr),         // %1
    "+r"(tmp_src_stride),  // %2
    "+r"(dst_width)        // %3
  : "r"(&kMult38_Div6),    // %4
    "r"(&kShuf38_2)        // %5
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v30", "v31", "memory", "cc"
  );
}
547
// Sum src_height rows vertically into 16-bit accumulators:
// dst_ptr[i] = sum over rows of src_ptr[i + row * src_stride],
// processing 16 columns per outer iteration.
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                    uint16* dst_ptr, int src_width, int src_height) {
  const uint8* src_tmp;
  asm volatile (
  "1:                                          \n"
    "mov       %0, %1                          \n"  // restart column walk
    "mov       w12, %w5                        \n"  // w12 = row counter
    "eor       v2.16b, v2.16b, v2.16b          \n"  // clear accumulators
    "eor       v3.16b, v3.16b, v3.16b          \n"
  "2:                                          \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "ld1       {v0.16b}, [%0], %3              \n"
    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
    "uaddw     v2.8h, v2.8h, v0.8b             \n"
    "subs      w12, w12, #1                    \n"
    "b.gt      2b                              \n"
    MEMACCESS(2)
    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
    "add      %1, %1, #16                      \n"
    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
    "b.gt     1b                               \n"
  : "=&r"(src_tmp),    // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_ptr),     // %2
    "+r"(src_stride),  // %3
    "+r"(src_width),   // %4
    "+r"(src_height)   // %5
  :
  // NOTE(review): "v1" is listed but never written by the asm — harmless.
  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
580
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Loads the byte pair src[x>>16], src[(x>>16)+1] into lane n of v4/v5
// (%1 = src base, %3 = 16.16 fixed-point x, %4 = dx, %5/%6 = scratch)
// and advances x by dx.
#define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
    "add        %6, %1, %5                    \n"              \
    "add        %3, %3, %4                     \n"             \
    MEMACCESS(6)                                               \
    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
589
// Horizontal bilinear filter: for each of 8 output pixels per loop,
// dst[i] = src[xi] + ((src[xi+1] - src[xi]) * frac(xi)) where xi steps
// by dx in 16.16 fixed point starting at x.
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v1.4s, v1.4s, v0.4s            \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add        v2.4s, v1.4s, v3.4s            \n"
    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
  "1:                                          \n"
    // Gather 8 pixel pairs, one lane at a time.
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    // Extract the 16-bit fraction of each x into v6.
    "mov       v6.16b, v1.16b                  \n"
    "mov       v7.16b, v2.16b                  \n"
    "uzp1      v6.8h, v6.8h, v7.8h             \n"
    "ushll     v4.8h, v4.8b, #0                \n"
    "ushll     v5.8h, v5.8b, #0                \n"
    // (src[xi+1] - src[xi]) * frac, rounded back to 16 bits.
    "ssubl     v16.4s, v5.4h, v4.4h            \n"
    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
    "ushll     v7.4s, v6.4h, #0                \n"
    "ushll2    v6.4s, v6.8h, #0                \n"
    "mul       v16.4s, v16.4s, v7.4s           \n"
    "mul       v17.4s, v17.4s, v6.4s           \n"
    "rshrn      v6.4h, v16.4s, #16             \n"
    "rshrn2     v6.8h, v17.4s, #16             \n"
    "add       v4.8h, v4.8h, v6.8h             \n"
    "xtn       v4.8b, v4.8h                    \n"

    MEMACCESS(0)
    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
    "add       v1.4s, v1.4s, v0.4s             \n"
    "add       v2.4s, v2.4s, v0.4s             \n"
    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
    "b.gt      1b                              \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width64),      // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}
652
653#undef LOAD2_DATA8_LANE
654
655// 16x2 -> 16x1
// 16x2 -> 16x1
// Vertical blend of two rows by source_y_fraction/256, with fast paths
// for the 0, 64, 128 and 192 fractions.
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
    int y_fraction = 256 - source_y_fraction;  // weight of row 0
  asm volatile (
    "cmp          %w4, #0                      \n"
    "b.eq         100f                         \n"  // 0: copy row 0
    "add          %2, %2, %1                   \n"  // %2 = row 1 ptr
    "cmp          %w4, #64                     \n"
    "b.eq         75f                          \n"
    "cmp          %w4, #128                    \n"
    "b.eq         50f                          \n"
    "cmp          %w4, #192                    \n"
    "b.eq         25f                          \n"

    "dup          v5.8b, %w4                   \n"  // row 1 weight
    "dup          v4.8b, %w5                   \n"  // row 0 weight
    // General purpose row blend.
  "1:                                          \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %w3, %w3, #16                \n"
    "umull        v6.8h, v0.8b, v4.8b          \n"
    "umull2       v7.8h, v0.16b, v4.16b        \n"
    "umlal        v6.8h, v1.8b, v5.8b          \n"
    "umlal2       v7.8h, v1.16b, v5.16b        \n"
    "rshrn        v0.8b, v6.8h, #8             \n"
    "rshrn2       v0.16b, v7.8h, #8            \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         1b                           \n"
    "b            99f                          \n"

    // Blend 25 / 75.
  "25:                                         \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %w3, %w3, #16                \n"
    // Two half-averages give v0/4 + 3*v1/4.
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         25b                          \n"
    "b            99f                          \n"

    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %w3, %w3, #16                \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         50b                          \n"
    "b            99f                          \n"

    // Blend 75 / 25.
  "75:                                         \n"
    MEMACCESS(1)
    "ld1          {v1.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v0.16b}, [%2], #16          \n"
    "subs         %w3, %w3, #16                \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         75b                          \n"
    "b            99f                          \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    "subs         %w3, %w3, #16                \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         100b                         \n"

  "99:                                         \n"
    MEMACCESS(0)
    // Writes the last pixel once more at dst_ptr[dst_width] — presumably
    // so filtering callers can read one past the row; confirm callers
    // allocate the extra byte.
    "st1          {v0.b}[15], [%0]             \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction),// %4
    "+r"(y_fraction)        // %5
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
  );
}
754
755void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
756                            uint8* dst, int dst_width) {
757  asm volatile (
758  "1:                                          \n"
759    // load even pixels into q0, odd into q1
760    MEMACCESS (0)
761    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
762    MEMACCESS (0)
763    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
764    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
765    MEMACCESS (1)
766    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
767    MEMACCESS (1)
768    "st1        {v3.16b}, [%1], #16            \n"
769    "b.gt       1b                             \n"
770  : "+r" (src_ptr),          // %0
771    "+r" (dst),              // %1
772    "+r" (dst_width)         // %2
773  :
774  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
775  );
776}
777
778void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
779                                  uint8* dst_argb, int dst_width) {
780  asm volatile (
781  "1:                                          \n"
782    MEMACCESS (0)
783    // load 8 ARGB pixels.
784    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
785    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
786    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
787    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
788    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
789    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
790    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
791    "rshrn      v1.8b, v1.8h, #1               \n"
792    "rshrn      v2.8b, v2.8h, #1               \n"
793    "rshrn      v3.8b, v3.8h, #1               \n"
794    MEMACCESS (1)
795    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
796    "b.gt       1b                             \n"
797  : "+r"(src_argb),         // %0
798    "+r"(dst_argb),         // %1
799    "+r"(dst_width)         // %2
800  :
801  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
802  );
803}
804
// Box filter ARGB down by 2 in both X and Y: reads 16x2 ARGB pixels,
// averages each 2x2 block per channel with rounding, writes 8x1 ARGB pixels.
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS (0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 16 ARGB pixels.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    // Pairwise-add adjacent pixels of row 1 per channel.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS (1)
    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 16 more ARGB pixels.
    // Pairwise-add row 2 and accumulate onto the row 1 sums.
    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
    // Divide the 4-pixel sums by 4 with rounding and narrow back to bytes.
    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
    "rshrn      v1.8b, v1.8h, #2               \n"
    "rshrn      v2.8b, v2.8h, #2               \n"
    "rshrn      v3.8b, v3.8h, #2               \n"
    MEMACCESS (2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
    "b.gt       1b                             \n"
  : "+r" (src_ptr),          // %0
    "+r" (src_stride),       // %1
    "+r" (dst),              // %2
    "+r" (dst_width)         // %3
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
  );
}
839
840// Reads 4 pixels at a time.
841// Alignment requirement: src_argb 4 byte aligned.
842void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
843                               int src_stepx, uint8* dst_argb, int dst_width) {
844  asm volatile (
845  "1:                                          \n"
846    MEMACCESS(0)
847    "ld1        {v0.s}[0], [%0], %3            \n"
848    MEMACCESS(0)
849    "ld1        {v0.s}[1], [%0], %3            \n"
850    MEMACCESS(0)
851    "ld1        {v0.s}[2], [%0], %3            \n"
852    MEMACCESS(0)
853    "ld1        {v0.s}[3], [%0], %3            \n"
854    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
855    MEMACCESS(1)
856    "st1        {v0.16b}, [%1], #16            \n"
857    "b.gt       1b                             \n"
858  : "+r"(src_argb),    // %0
859    "+r"(dst_argb),    // %1
860    "+r"(dst_width)    // %2
861  : "r"((int64)(src_stepx * 4)) // %3
862  : "memory", "cc", "v0"
863  );
864}
865
866// Reads 4 pixels at a time.
867// Alignment requirement: src_argb 4 byte aligned.
868// TODO(Yang Zhang): Might be worth another optimization pass in future.
869// It could be upgraded to 8 pixels at a time to start with.
// Box filters every src_stepx-th ARGB pixel over two rows: each output pixel
// is the rounded average of a 2x2 source block. Processes 4 output pixels
// per loop.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    // Turn src_stride (%1) into a pointer to the second source row.
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    // Each ld1 reads 8 bytes = a 2x1 pair of ARGB pixels; pairs from row 1
    // (%0) and row 2 (%1) are loaded alternately, stepping by src_stepx * 4.
    MEMACCESS(0)
    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "ld1        {v1.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v2.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v3.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v4.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v5.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v6.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v7.8b}, [%1], %4              \n"
    // Add top row to bottom row, widening to 16-bit per channel.
    "uaddl      v0.8h, v0.8b, v1.8b            \n"
    "uaddl      v2.8h, v2.8b, v3.8b            \n"
    "uaddl      v4.8h, v4.8b, v5.8b            \n"
    "uaddl      v6.8h, v6.8b, v7.8b            \n"
    // Rearrange 64-bit halves so the left/right pixels of each block line up.
    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
    "mov        v0.d[1], v2.d[0]               \n"
    "mov        v2.d[0], v16.d[1]              \n"
    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
    "mov        v4.d[1], v6.d[0]               \n"
    "mov        v6.d[0], v16.d[1]              \n"
    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
    // Divide the 4-sample sums by 4 with rounding and narrow to bytes.
    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "st1     {v0.16b}, [%2], #16               \n"
    "b.gt       1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"((int64)(src_stepx * 4)) // %4
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
918
919// TODO(Yang Zhang): Investigate less load instructions for
920// the x/dx stepping
// Loads one 32-bit ARGB pixel into lane n of register vn from
// src (%1) + (x (%3) >> 16) * 4, then advances x by dx (%4).
// Uses %5 as scratch and %6 as the computed load address; operand numbers
// refer to the asm operand list in ScaleARGBCols_NEON below.
#define LOAD1_DATA32_LANE(vn, n)                               \
    "lsr        %5, %3, #16                    \n"             \
    "add        %6, %1, %5, lsl #2             \n"             \
    "add        %3, %3, %4                     \n"             \
    MEMACCESS(6)                                               \
    "ld1        {"#vn".s}["#n"], [%6]          \n"
927
// Point samples ARGB pixels in X with 16.16 fixed-point stepping:
// dst[i] = src[(x + i * dx) >> 16], gathering 8 pixels per loop.
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  const uint8* src_tmp = src_argb;
  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  int64 tmp64;
  asm volatile (
  "1:                                          \n"
    // Gather 8 pixels one lane at a time (see LOAD1_DATA32_LANE above).
    LOAD1_DATA32_LANE(v0, 0)
    LOAD1_DATA32_LANE(v0, 1)
    LOAD1_DATA32_LANE(v0, 2)
    LOAD1_DATA32_LANE(v0, 3)
    LOAD1_DATA32_LANE(v1, 0)
    LOAD1_DATA32_LANE(v1, 1)
    LOAD1_DATA32_LANE(v1, 2)
    LOAD1_DATA32_LANE(v1, 3)

    MEMACCESS(0)
    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    "b.gt        1b                            \n"
  : "+r"(dst_argb),     // %0
    "+r"(src_argb),     // %1
    "+r"(dst_width64),  // %2
    "+r"(x64),          // %3
    "+r"(dx64),         // %4
    "=&r"(tmp64),       // %5
    "+r"(src_tmp)       // %6
  :
  : "memory", "cc", "v0", "v1"
  );
}
961
962#undef LOAD1_DATA32_LANE
963
964// TODO(Yang Zhang): Investigate less load instructions for
965// the x/dx stepping
// Loads the two adjacent 32-bit ARGB pixels at src (%1) + (x (%3) >> 16) * 4
// into lane n of vn1 and vn2 (de-interleaved by ld2), then advances x by
// dx (%4). Uses %5 as scratch and %6 as the computed load address; operand
// numbers refer to the asm operand list in ScaleARGBFilterCols_NEON below.
#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
    "lsr        %5, %3, #16                           \n"      \
    "add        %6, %1, %5, lsl #2                    \n"      \
    "add        %3, %3, %4                            \n"      \
    MEMACCESS(6)                                               \
    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
972
// Bilinearly filters ARGB pixels in X with 16.16 fixed-point stepping:
// for each output pixel, blends the two adjacent source pixels at x >> 16
// using a 7-bit fraction f = (x >> 9) & 0x7f, i.e.
// dst = (a * (127 - f) + b * f) >> 7.  Processes 4 pixels per loop.
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    "movi       v3.16b, #0x7f                  \n"  // 0x7F
    "movi       v4.8h, #0x7f                   \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v5.4s, v1.4s, v0.4s            \n"
  "1:                                          \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    // f = (x >> 9) & 0x7f: top 7 bits of the 16-bit fraction, per pixel.
    "shrn       v2.4h, v5.4s, #9               \n"
    "and        v2.8b, v2.8b, v4.8b            \n"
    // Replicate each pixel's fraction across its 4 channel bytes.
    "dup        v16.8b, v2.b[0]                \n"
    "dup        v17.8b, v2.b[2]                \n"
    "dup        v18.8b, v2.b[4]                \n"
    "dup        v19.8b, v2.b[6]                \n"
    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
    "ins        v2.d[1], v17.d[0]              \n"  // f
    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    // a * (127 - f) + b * f, widened to 16 bits per channel.
    "umull      v16.8h, v0.8b, v7.8b           \n"
    "umull2     v17.8h, v0.16b, v7.16b         \n"
    "umull      v18.8h, v1.8b, v2.8b           \n"
    "umull2     v19.8h, v1.16b, v2.16b         \n"
    "add        v16.8h, v16.8h, v18.8h         \n"
    "add        v17.8h, v17.8h, v19.8h         \n"
    // Narrow back to bytes: >> 7 undoes the 7-bit fraction scaling.
    "shrn       v0.8b, v16.8h, #7              \n"
    "shrn2      v0.16b, v17.8h, #7             \n"

    MEMACCESS(0)
    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
    "add     v5.4s, v5.4s, v6.4s               \n"
    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
    "b.gt    1b                                \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width64),      // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}
1034
1035#undef LOAD2_DATA32_LANE
1036
1037#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
1038
1039#ifdef __cplusplus
1040}  // extern "C"
1041}  // namespace libyuv
1042#endif
1043