1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
18// This module is for GCC Neon.
19#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
20
21// NEON downscalers with interpolation.
22// Provided by Fritz Koenig
23
24// Read 32x1 throw away even pixels, and write 16x1.
25void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
26                        uint8* dst, int dst_width) {
27  asm volatile (
28    ".p2align   2                              \n"
29  "1:                                          \n"
30    // load even pixels into q0, odd into q1
31    MEMACCESS(0)
32    "vld2.8     {q0, q1}, [%0]!                \n"
33    "subs       %2, %2, #16                    \n"  // 16 processed per loop
34    MEMACCESS(1)
35    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
36    "bgt        1b                             \n"
37  : "+r"(src_ptr),          // %0
38    "+r"(dst),              // %1
39    "+r"(dst_width)         // %2
40  :
41  : "q0", "q1"              // Clobber List
42  );
43}
44
45// Read 32x2 average down and write 16x1.
46void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
47                           uint8* dst, int dst_width) {
48  asm volatile (
49    // change the stride to row 2 pointer
50    "add        %1, %0                         \n"
51    ".p2align   2                              \n"
52  "1:                                          \n"
53    MEMACCESS(0)
54    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
55    MEMACCESS(1)
56    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
57    "subs       %3, %3, #16                    \n"  // 16 processed per loop
58    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
59    "vpaddl.u8  q1, q1                         \n"
60    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
61    "vpadal.u8  q1, q3                         \n"
62    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
63    "vrshrn.u16 d1, q1, #2                     \n"
64    MEMACCESS(2)
65    "vst1.8     {q0}, [%2]!                    \n"
66    "bgt        1b                             \n"
67  : "+r"(src_ptr),          // %0
68    "+r"(src_stride),       // %1
69    "+r"(dst),              // %2
70    "+r"(dst_width)         // %3
71  :
72  : "q0", "q1", "q2", "q3"     // Clobber List
73  );
74}
75
// Point-sample 32x1 source pixels down to 8x1: vld4 deinterleaves every
// 4th byte into d0..d3, and lane 2 (pixel offset 2 of each group of 4)
// is kept.  src_stride is unused.  dst_width must be a multiple of 8.
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
    "subs       %2, %2, #8                     \n" // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {d2}, [%1]!                    \n" // keep pixel 2 of each 4
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}
94
// Box-filter 16x4 source pixels down to 4x1: each output byte is the
// rounded average of a 4x4 block.  dst_width must be a multiple of 4.
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  // Pointers to source rows 1..3 (row 0 is src_ptr itself).
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
    MEMACCESS(3)
    "vld1.8     {q1}, [%3]!                    \n"
    MEMACCESS(4)
    "vld1.8     {q2}, [%4]!                    \n"
    MEMACCESS(5)
    "vld1.8     {q3}, [%5]!                    \n"
    "subs       %2, %2, #4                     \n"
    // Pairwise-add row 0, then accumulate rows 1..3 pairwise: q0 now
    // holds 8 u16 sums of 2 columns x 4 rows each.
    "vpaddl.u8  q0, q0                         \n"
    "vpadal.u8  q0, q1                         \n"
    "vpadal.u8  q0, q2                         \n"
    "vpadal.u8  q0, q3                         \n"
    // Pairwise-add again: 4 u32 sums of 4 columns x 4 rows (16 samples).
    "vpaddl.u16 q0, q0                         \n"
    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
    "vmovn.u16  d0, q0                         \n"   // narrow to 4 bytes
    MEMACCESS(1)
    "vst1.32    {d0[0]}, [%1]!                 \n"   // store 4 output pixels
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width), // %2
    "+r"(src_ptr1),  // %3
    "+r"(src_ptr2),  // %4
    "+r"(src_ptr3)   // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
132
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
// Point-sample 32 pixels down to 24 (3/4 scale): of each group of 4
// pixels, pixels 0, 1 and 3 are kept.  src_stride is unused.
// dst_width must be a multiple of 24.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    "subs       %2, %2, #24                  \n"
    "vmov       d2, d3                       \n" // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8     {d0, d1, d2}, [%1]!          \n" // interleave back to 24 bytes
    "bgt        1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
156
// 3/4 box downscale, variant weighted toward row 0: the two source rows
// are first blended as (3 * row0 + row1 + 2) >> 2, then 4 pixels are
// filtered down to 3 horizontally.  dst_width must be a multiple of 24.
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"  // %3 = row 1 pointer
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    "subs         %2, %2, #24                  \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8     q8, d4                       \n"
    "vmovl.u8     q9, d5                       \n"
    "vmovl.u8     q10, d6                      \n"
    "vmovl.u8     q11, d7                      \n"

    // 3 * line_0 + line_1
    "vmlal.u8     q8, d0, d24                  \n"
    "vmlal.u8     q9, d1, d24                  \n"
    "vmlal.u8     q10, d2, d24                 \n"
    "vmlal.u8     q11, d3, d24                 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16  d0, q8, #2                   \n"
    "vqrshrn.u16  d1, q9, #2                   \n"
    "vqrshrn.u16  d2, q10, #2                  \n"
    "vqrshrn.u16  d3, q11, #2                  \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q8, d1                       \n"
    "vmlal.u8     q8, d0, d24                  \n"
    "vqrshrn.u16  d0, q8, #2                   \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2                   \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q8, d2                       \n"
    "vmlal.u8     q8, d3, d24                  \n"
    "vqrshrn.u16  d2, q8, #2                   \n"

    MEMACCESS(1)
    "vst3.8       {d0, d1, d2}, [%1]!          \n"

    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
216
217void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
218                               ptrdiff_t src_stride,
219                               uint8* dst_ptr, int dst_width) {
220  asm volatile (
221    "vmov.u8    d24, #3                        \n"
222    "add        %3, %0                         \n"
223    ".p2align   2                              \n"
224  "1:                                          \n"
225    MEMACCESS(0)
226    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
227    MEMACCESS(3)
228    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
229    "subs         %2, %2, #24                  \n"
230    // average src line 0 with src line 1
231    "vrhadd.u8    q0, q0, q2                   \n"
232    "vrhadd.u8    q1, q1, q3                   \n"
233
234    // a0 = (src[0] * 3 + s[1] * 1) >> 2
235    "vmovl.u8     q3, d1                       \n"
236    "vmlal.u8     q3, d0, d24                  \n"
237    "vqrshrn.u16  d0, q3, #2                   \n"
238
239    // a1 = (src[1] * 1 + s[2] * 1) >> 1
240    "vrhadd.u8    d1, d1, d2                   \n"
241
242    // a2 = (src[2] * 1 + s[3] * 3) >> 2
243    "vmovl.u8     q3, d2                       \n"
244    "vmlal.u8     q3, d3, d24                  \n"
245    "vqrshrn.u16  d2, q3, #2                   \n"
246
247    MEMACCESS(1)
248    "vst3.8       {d0, d1, d2}, [%1]!          \n"
249    "bgt          1b                           \n"
250  : "+r"(src_ptr),          // %0
251    "+r"(dst_ptr),          // %1
252    "+r"(dst_width),        // %2
253    "+r"(src_stride)        // %3
254  :
255  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
256  );
257}
258
#define HAS_SCALEROWDOWN38_NEON
// vtbl shuffle that point-samples 12 of 32 bytes for the 3/8 scale
// (picks roughly every 8/3rd pixel); trailing zeros are unused lanes.
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
// vtbl shuffle used by the box variants to gather the 12 finished sums
// out of three adjacent d registers (indices 0-7, 8-15, 16-23).
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
// Fixed-point reciprocals for vqrdmulh.s16, which doubles the product
// before taking the high 16 bits - so 65536/12 effectively divides by 6
// (sum of 2 cols x 3 rows), and 65536/18 divides by 9 (3 cols x 3 rows).
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
270
271// 32 -> 12
// Point-sample 32 pixels down to 12 (3/8 scale) using a vtbl shuffle
// driven by kShuf38.  src_stride is unused.  dst_width must be a
// multiple of 12.
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8     {q3}, [%3]                     \n"  // q3 = kShuf38 indices
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // 32 source pixels
    "subs       %2, %2, #12                    \n"
    // Table-lookup 12 of the 32 bytes into d4/d5.
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
    MEMACCESS(1)
    "vst1.8     {d4}, [%1]!                    \n"  // store first 8
    MEMACCESS(1)
    "vst1.32    {d5[0]}, [%1]!                 \n"  // store remaining 4
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}
297
298// 32x3 -> 12x1
// 3/8 box downscale over three source rows: 32x3 pixels -> 12x1.
// Most outputs average a 3x3 block (divide by 9 via kMult38_Div9); the
// last output of each group of 4 averages a 2x3 block (divide by 6 via
// kMult38_Div6).  dst_width must be a multiple of 12.
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  // Row 2 pointer; row 1 is computed inside the asm (%3 = %0 + stride).
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16    {q13}, [%5]                    \n"  // q13 = kMult38_Div6
    MEMACCESS(6)
    "vld1.8     {q14}, [%6]                    \n"  // q14 = kShuf38_2
    MEMACCESS(7)
    "vld1.8     {q15}, [%7]                    \n"  // q15 = kMult38_Div9
    "add        %3, %0                         \n"  // %3 = row 1 pointer
    ".p2align   2                              \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    MEMACCESS(4)
    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8      d0, d1                       \n"
    "vtrn.u8      d4, d5                       \n"
    "vtrn.u8      d16, d17                     \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8      d2, d3                       \n"
    "vtrn.u8      d6, d7                       \n"
    "vtrn.u8      d18, d19                     \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8    q0, q0                       \n"
    "vpaddl.u8    q2, q2                       \n"
    "vpaddl.u8    q8, q8                       \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8    d3, d3                       \n"
    "vpaddl.u8    d7, d7                       \n"
    "vpaddl.u8    d19, d19                     \n"

    // combine source lines
    "vadd.u16     q0, q2                       \n"
    "vadd.u16     q0, q8                       \n"
    "vadd.u16     d4, d3, d7                   \n"
    "vadd.u16     d4, d19                      \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13                  \n"
    "vmovn.u16    d4, q2                       \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8     q1, d2                       \n"
    "vmovl.u8     q3, d6                       \n"
    "vmovl.u8     q9, d18                      \n"

    // combine source lines
    "vadd.u16     q1, q3                       \n"
    "vadd.u16     q1, q9                       \n"

    // d2 = xx 20 xx 30 xx 22 xx 32
    // d3 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32     d2, d3                       \n"

    // d2 = xx 20 xx 21 xx 22 xx 23
    // d3 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16     d2, d3                       \n"

    // 0+1+2, 3+4+5
    "vadd.u16     q0, q1                       \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15                  \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8      d2, d4                       \n"

    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

    MEMACCESS(1)
    "vst1.8       {d3}, [%1]!                  \n"
    MEMACCESS(1)
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride),       // %3
    "+r"(src_ptr1)          // %4
  : "r"(&kMult38_Div6),     // %5
    "r"(&kShuf38_2),        // %6
    "r"(&kMult38_Div9)      // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}
417
418// 32x2 -> 12x1
// 3/8 box downscale over two source rows: 32x2 pixels -> 12x1.
// Most outputs average a 3x2 block (divide by 6 via kMult38_Div6); the
// last output of each group of 4 averages a 2x2 block (shift by 2).
// dst_width must be a multiple of 12.
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16    {q13}, [%4]                    \n"  // q13 = kMult38_Div6
    MEMACCESS(5)
    "vld1.8     {q14}, [%5]                    \n"  // q14 = kShuf38_2
    "add        %3, %0                         \n"  // %3 = row 1 pointer
    ".p2align   2                              \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8      d0, d1                       \n"
    "vtrn.u8      d4, d5                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8      d2, d3                       \n"
    "vtrn.u8      d6, d7                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8    q0, q0                       \n"
    "vpaddl.u8    q2, q2                       \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8    d3, d3                       \n"
    "vpaddl.u8    d7, d7                       \n"

    // combine source lines
    "vadd.u16     q0, q2                       \n"
    "vadd.u16     d4, d3, d7                   \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16  d4, q2, #2                   \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8     q1, d2                       \n"
    "vmovl.u8     q3, d6                       \n"

    // combine source lines
    "vadd.u16     q1, q3                       \n"

    // d2 = xx 20 xx 30 xx 22 xx 32
    // d3 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32     d2, d3                       \n"

    // d2 = xx 20 xx 21 xx 22 xx 23
    // d3 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16     d2, d3                       \n"

    // 0+1+2, 3+4+5
    "vadd.u16     q0, q1                       \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13                  \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8      d2, d4                       \n"

    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

    MEMACCESS(1)
    "vst1.8       {d3}, [%1]!                  \n"
    MEMACCESS(1)
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
518
519// 16x2 -> 16x1
// Vertically blend two source rows into one: 16x2 -> 16x1 per iteration.
// source_y_fraction (0..256 scale) is the weight of row 1; the common
// fractions 0, 64, 128 and 192 take fast special-case paths, otherwise a
// general (row0*(256-f) + row1*f + 128) >> 8 blend is used.
// After the loop, the last output pixel is duplicated once at the
// current dst position (dst_ptr[dst_width]) - callers are expected to
// allow for this extra byte; confirm against ScaleFilterRows usage.
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp          %4, #0                       \n"  // fraction 0: copy row 0
    "beq          100f                         \n"
    "add          %2, %1                       \n"  // %2 = row 1 pointer
    "cmp          %4, #64                      \n"
    "beq          75f                          \n"
    "cmp          %4, #128                     \n"
    "beq          50f                          \n"
    "cmp          %4, #192                     \n"
    "beq          25f                          \n"

    "vdup.8       d5, %4                       \n"  // d5 = weight of row 1
    "rsb          %4, #256                     \n"  // %4 = 256 - fraction
    "vdup.8       d4, %4                       \n"  // d4 = weight of row 0
    // General purpose row blend.
  "1:                                          \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vmull.u8     q13, d0, d4                  \n"
    "vmull.u8     q14, d1, d4                  \n"
    "vmlal.u8     q13, d2, d5                  \n"
    "vmlal.u8     q14, d3, d5                  \n"
    "vrshrn.u16   d0, q13, #8                  \n"
    "vrshrn.u16   d1, q14, #8                  \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          1b                           \n"
    "b            99f                          \n"

    // Blend 25 / 75: two rounded halvings give (row0 + 3*row1) / 4.
  "25:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          25b                          \n"
    "b            99f                          \n"

    // Blend 50 / 50: single rounded average of the two rows.
  "50:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          50b                          \n"
    "b            99f                          \n"

    // Blend 75 / 25: same as 25/75 but with the row loads swapped.
  "75:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q1}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q0}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          75b                          \n"
    "b            99f                          \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    "subs         %3, %3, #16                  \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          100b                         \n"

  "99:                                         \n"
    // Duplicate the last computed pixel at dst_ptr[dst_width].
    MEMACCESS(0)
    "vst1.8       {d1[7]}, [%0]                \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
617
// Point-sample 16x1 ARGB pixels down to 8x1: vld2.32 deinterleaves
// 32-bit pixels so q1/q3 hold the odd pixels, which are kept.
// src_stride is unused.  dst_width must be a multiple of 8 pixels.
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    ".p2align   2                              \n"
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32    {q0, q1}, [%0]!                \n"
    MEMACCESS(0)
    "vld2.32    {q2, q3}, [%0]!                \n"  // next 8 pixels
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8     {q3}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
641
// Box-filter 16x2 ARGB pixels down to 8x1: each output channel is the
// rounded average of a 2x2 block, computed per plane (B, G, R, A) via
// vld4 deinterleaving.  dst_width must be a multiple of 8 pixels.
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vrshrn.u16 d2, q2, #2                     \n"
    "vrshrn.u16 d3, q3, #2                     \n"
    MEMACCESS(2)
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // re-interleave and store
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
681
682// Reads 4 pixels at a time.
683// Alignment requirement: src_argb 4 byte aligned.
// Point-sample ARGB pixels with an arbitrary horizontal step: gathers
// 4 pixels per iteration, advancing src by src_stepx pixels (r12 =
// src_stepx * 4 bytes) after each load.  src_stride is unused.
// dst_width must be a multiple of 4 pixels.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %3, lsl #2                \n"  // byte step = stepx * 4
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.32    {d0[0]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d0[1]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d1[0]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d1[1]}, [%0], r12             \n"
    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(dst_width)    // %2
  : "r"(src_stepx)     // %3
  : "memory", "cc", "r12", "q0"
  );
}
709
710// Reads 4 pixels at a time.
711// Alignment requirement: src_argb 4 byte aligned.
// Box-filter ARGB pixels with an arbitrary horizontal step: each output
// pixel is the rounded per-channel average of a 2x2 source block; blocks
// are src_stepx pixels apart (r12 = src_stepx * 4 bytes).  %1 is the
// second-row pointer.  dst_width must be a multiple of 4 pixels.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %4, lsl #2                \n"  // byte step = stepx * 4
    "add        %1, %1, %0                     \n"  // %1 = row 2 pointer
    ".p2align   2                              \n"
  "1:                                          \n"
    // Each d register holds 2 ARGB pixels; pair row-0/row-1 loads so
    // vaddl sums the two rows vertically into 16-bit lanes.
    MEMACCESS(0)
    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8     {d1}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d2}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d3}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d4}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d5}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d6}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d7}, [%1], r12                \n"
    "vaddl.u8   q0, d0, d1                     \n"  // vertical sums, block 0
    "vaddl.u8   q1, d2, d3                     \n"
    "vaddl.u8   q2, d4, d5                     \n"
    "vaddl.u8   q3, d6, d7                     \n"
    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}
758
759#endif  // __ARM_NEON__
760
761#ifdef __cplusplus
762}  // extern "C"
763}  // namespace libyuv
764#endif
765