133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp/* 233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * Copyright 2011 The LibYuv Project Authors. All rights reserved. 333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * 433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * Use of this source code is governed by a BSD-style license 533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * that can be found in the LICENSE file in the root of the source 633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * tree. An additional intellectual property rights grant can be found 733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * in the file PATENTS. All contributing project authors may 833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * be found in the AUTHORS file in the root of the source tree. 933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp */ 1033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 1133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/basic_types.h" 1233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h" 1333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus 1533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampnamespace libyuv { 1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampextern "C" { 1733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 1833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 1933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// This module is for GCC Neon 2033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) 2133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 2233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp/** 2333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * NEON downscalers with interpolation. 2433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * 2533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * Provided by Fritz Koenig 2633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * 2733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp */ 2833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 2933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, 3033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst, int dst_width) { 3133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 3233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 3333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // load even pixels into q0, odd into q1 3433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld2.u8 {q0,q1}, [%0]! \n" 3533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {q0}, [%1]! \n" // store even pixels 3633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, %2, #16 \n" // 16 processed per loop 3733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 3833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 3933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst), // %1 4033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width) // %2 4133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : 4233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "q0", "q1" // Clobber List 4333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 4433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 4533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 4633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 4733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst, int dst_width) { 4833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 4933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // change the stride to row 2 pointer 5033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add %1, %0 \n" 5133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 5233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc 5333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc 5433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 q0, q0 \n" // row 1 add adjacent 5533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 q1, q1 \n" 5633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 5733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpadal.u8 q1, q3 \n" 5833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack 5933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrshrn.u16 d1, q1, #2 \n" 6033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {q0}, [%2]! \n" 6133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %3, %3, #16 \n" // 16 processed per loop 6233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 6333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 6433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(src_stride), // %1 6533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst), // %2 6633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width) // %3 6733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : 6833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "q0", "q1", "q2", "q3" // Clobber List 6933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 7033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 7133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 7233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, 7333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_ptr, int dst_width) { 7433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 7533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 7633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld2.u8 {d0, d1}, [%0]! \n" 7733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d1, d0 \n" 7833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vshrn.u16 d0, q0, #8 \n" 7933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u32 {d0[1]}, [%1]! \n" 8033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, #4 \n" 8133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 8233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 8333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_ptr), // %1 8433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width) // %2 8533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : 8633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "q0", "q1", "memory", "cc" 8733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 8833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 8933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 9033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 9133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_ptr, int dst_width) { 9233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 9333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add r4, %0, %3 \n" 9433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add r5, r4, %3 \n" 9533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add %3, r5, %3 \n" 9633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 9733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q0}, [%0]! \n" // load up 16x4 9833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q1}, [r4]! \n" 9933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q2}, [r5]! \n" 10033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q3}, [%3]! \n" 10133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 q0, q0 \n" 10233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpadal.u8 q0, q1 \n" 10333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpadal.u8 q0, q2 \n" 10433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpadal.u8 q0, q3 \n" 10533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u16 q0, q0 \n" 10633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding 10733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovn.u16 d0, q0 \n" 10833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u32 {d0[0]}, [%1]! \n" 10933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, #4 \n" 11033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 11133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 11233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_ptr), // %1 11333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width) // %2 11433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "r"(src_stride) // %3 11533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" 11633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 11733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 11833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 11933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Down scale from 4 to 3 pixels. Use the neon multilane read/write 12033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// to load up the every 4th pixel into a 4 different registers. 12133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Point samples 32 pixels to 24 pixels. 12233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown34_NEON(const uint8* src_ptr, 12333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ptrdiff_t /* src_stride */, 12433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_ptr, int dst_width) { 12533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 12633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 12733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 12833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmov d2, d3 \n" // order d0, d1, d2 12933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst3.u8 {d0, d1, d2}, [%1]! \n" 13033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, #24 \n" 13133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 13233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 13333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_ptr), // %1 13433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width) // %2 13533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : 13633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "d0", "d1", "d2", "d3", "memory", "cc" 13733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 13833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 13933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 14033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, 14133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ptrdiff_t src_stride, 14233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_ptr, int dst_width) { 14333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 14433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmov.u8 d24, #3 \n" 14533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add %3, %0 \n" 14633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 14733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 14833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 14933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 15033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // filter src line 0 with src line 1 15133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // expand chars to shorts to allow for room 15233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // when adding lines together 15333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q8, d4 \n" 15433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q9, d5 \n" 15533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q10, d6 \n" 15633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q11, d7 \n" 15733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 15833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 3 * line_0 + line_1 15933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q8, d0, d24 \n" 16033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q9, d1, d24 \n" 16133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q10, d2, d24 \n" 16233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q11, d3, d24 \n" 16333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 16433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // (3 * line_0 + line_1) >> 2 16533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d0, q8, #2 \n" 16633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d1, q9, #2 \n" 16733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d2, q10, #2 \n" 16833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d3, q11, #2 \n" 16933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 17033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // a0 = (src[0] * 3 + s[1] * 1) >> 2 17133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q8, d1 \n" 17233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q8, d0, d24 \n" 17333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d0, q8, #2 \n" 17433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 17533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // a1 = (src[1] * 1 + s[2] * 1) >> 1 17633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrhadd.u8 d1, d1, d2 \n" 17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 17833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // a2 = (src[2] * 1 + s[3] * 3) >> 2 17933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q8, d2 \n" 18033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q8, d3, d24 \n" 18133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d2, q8, #2 \n" 18233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 18333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst3.u8 {d0, d1, d2}, [%1]! \n" 18433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 18533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, #24 \n" 18633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 18733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 18833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_ptr), // %1 18933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width), // %2 19033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(src_stride) // %3 19133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : 19233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" 19333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 19433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 19533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 19633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, 19733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ptrdiff_t src_stride, 19833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_ptr, int dst_width) { 19933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 20033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmov.u8 d24, #3 \n" 20133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add %3, %0 \n" 20233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 20333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 20433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 20533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 20633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // average src line 0 with src line 1 20733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrhadd.u8 q0, q0, q2 \n" 20833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrhadd.u8 q1, q1, q3 \n" 20933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 21033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // a0 = (src[0] * 3 + s[1] * 1) >> 2 21133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q3, d1 \n" 21233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q3, d0, d24 \n" 21333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d0, q3, #2 \n" 21433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 21533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // a1 = (src[1] * 1 + s[2] * 1) >> 1 21633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrhadd.u8 d1, d1, d2 \n" 21733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 21833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // a2 = (src[2] * 1 + s[3] * 3) >> 2 21933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q3, d2 \n" 22033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q3, d3, d24 \n" 22133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d2, q3, #2 \n" 22233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 22333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst3.u8 {d0, d1, d2}, [%1]! \n" 22433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 22533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, #24 \n" 22633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 22733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 22833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_ptr), // %1 22933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width), // %2 23033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(src_stride) // %3 23133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : 23233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" 23333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 23433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 23533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 23633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_SCALEROWDOWN38_NEON 23733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampconst uvec8 kShuf38 = 23833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; 23933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampconst uvec8 kShuf38_2 = 24033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; 24133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampconst vec16 kMult38_Div6 = 24233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 24333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; 24433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampconst vec16 kMult38_Div9 = 24533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 24633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; 24733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 24833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 32 -> 12 24933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown38_NEON(const uint8* src_ptr, 25033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ptrdiff_t /* src_stride */, 25133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_ptr, int dst_width) { 25233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 25333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q3}, [%3] \n" 25433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 25533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" 25633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" 25733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" 25833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {d4}, [%1]! \n" 25933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u32 {d5[0]}, [%1]! \n" 26033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, #12 \n" 26133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 26233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 26333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_ptr), // %1 26433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width) // %2 26533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "r"(&kShuf38) // %3 26633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" 26733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 26833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 26933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 27033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 32x3 -> 12x1 27133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, 27233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ptrdiff_t src_stride, 27333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_ptr, int dst_width) { 27433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 27533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u16 {q13}, [%4] \n" 27633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q14}, [%5] \n" 27733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q15}, [%6] \n" 27833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add r4, %0, %3, lsl #1 \n" 27933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add %3, %0 \n" 28033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 28133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 28233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d0 = 00 40 01 41 02 42 03 43 28333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d1 = 10 50 11 51 12 52 13 53 28433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d2 = 20 60 21 61 22 62 23 63 28533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d3 = 30 70 31 71 32 72 33 73 28633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" 28733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" 28833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" 28933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 29033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Shuffle the input data around to get align the data 29133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 29233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d0 = 00 10 01 11 02 12 03 13 29333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d1 = 40 50 41 51 42 52 43 53 29433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d0, d1 \n" 29533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d4, d5 \n" 29633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d16, d17 \n" 29733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 29833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d2 = 20 30 21 31 22 32 23 33 29933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d3 = 60 70 61 71 62 72 63 73 30033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d2, d3 \n" 30133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d6, d7 \n" 30233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d18, d19 \n" 30333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 30433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d0 = 00+10 01+11 02+12 03+13 30533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d2 = 40+50 41+51 42+52 43+53 30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 q0, q0 \n" 30733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 q2, q2 \n" 30833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 q8, q8 \n" 30933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 31033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d3 = 60+70 61+71 62+72 63+73 31133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 d3, d3 \n" 31233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 d7, d7 \n" 31333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 d19, d19 \n" 31433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 31533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // combine source lines 31633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 q0, q2 \n" 31733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 q0, q8 \n" 31833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 d4, d3, d7 \n" 31933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 d4, d19 \n" 32033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 32133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] 32233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // + s[6 + st * 1] + s[7 + st * 1] 32333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // + s[6 + st * 2] + s[7 + st * 2]) / 6 32433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrdmulh.s16 q2, q2, q13 \n" 32533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovn.u16 d4, q2 \n" 32633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 32733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Shuffle 2,3 reg around so that 2 can be added to the 32833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 0,1 reg and 3 can be added to the 4,5 reg. This 32933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // requires expanding from u8 to u16 as the 0,1 and 4,5 33033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // registers are already expanded. Then do transposes 33133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // to get aligned. 33233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 33333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q1, d2 \n" 33433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q3, d6 \n" 33533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q9, d18 \n" 33633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // combine source lines 33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 q1, q3 \n" 33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 q1, q9 \n" 34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d4 = xx 20 xx 30 xx 22 xx 32 34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d5 = xx 21 xx 31 xx 23 xx 33 34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u32 d2, d3 \n" 34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d4 = xx 20 xx 21 xx 22 xx 23 34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d5 = xx 30 xx 31 xx 32 xx 33 34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u16 d2, d3 \n" 34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 0+1+2, 3+4+5 35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 q0, q1 \n" 35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Need to divide, but can't downshift as the the value 35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // isn't a power of 2. So multiply by 65536 / n 35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // and take the upper 16 bits. 35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrdmulh.s16 q0, q0, q15 \n" 35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Align for table lookup, vtbl requires registers to 35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // be adjacent 35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmov.u8 d2, d4 \n" 36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtbl.u8 d3, {d0, d1, d2}, d28 \n" 36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtbl.u8 d4, {d0, d1, d2}, d29 \n" 36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {d3}, [%1]! \n" 36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u32 {d4[0]}, [%1]! \n" 36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, #12 \n" 36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_ptr), // %1 37033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width), // %2 37133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(src_stride) // %3 37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "r"(&kMult38_Div6), // %4 37333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "r"(&kShuf38_2), // %5 37433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "r"(&kMult38_Div9) // %6 37533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "r4", "q0", "q1", "q2", "q3", "q8", "q9", 37633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "q13", "q14", "q15", "memory", "cc" 37733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 32x2 -> 12x1 38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, 38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ptrdiff_t src_stride, 38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_ptr, int dst_width) { 38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u16 {q13}, [%4] \n" 38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q14}, [%5] \n" 38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add %3, %0 \n" 38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d0 = 00 40 01 41 02 42 03 43 39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d1 = 10 50 11 51 12 52 13 53 39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d2 = 20 60 21 61 22 62 23 63 39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d3 = 30 70 31 71 32 72 33 73 39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" 39533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" 39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Shuffle the input data around to get align the data 39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d0 = 00 10 01 11 02 12 03 13 40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d1 = 40 50 41 51 42 52 43 53 40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d0, d1 \n" 40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d4, d5 \n" 40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d2 = 20 30 21 31 22 32 23 33 40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d3 = 60 70 61 71 62 72 63 73 40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d2, d3 \n" 40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u8 d6, d7 \n" 40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d0 = 00+10 01+11 02+12 03+13 41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d2 = 40+50 41+51 42+52 43+53 41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 q0, q0 \n" 41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 q2, q2 \n" 41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d3 = 60+70 61+71 62+72 63+73 41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 d3, d3 \n" 41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vpaddl.u8 d7, d7 \n" 41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // combine source lines 41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 q0, q2 \n" 42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 d4, d3, d7 \n" 42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrshrn.u16 d4, q2, #2 \n" 42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Shuffle 2,3 reg around so that 2 can be added to the 42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 0,1 reg and 3 can be added to the 4,5 reg. This 42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // requires expanding from u8 to u16 as the 0,1 and 4,5 42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // registers are already expanded. Then do transposes 42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // to get aligned. 43033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q1, d2 \n" 43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmovl.u8 q3, d6 \n" 43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // combine source lines 43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 q1, q3 \n" 43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d4 = xx 20 xx 30 xx 22 xx 32 43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d5 = xx 21 xx 31 xx 23 xx 33 43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u32 d2, d3 \n" 44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d4 = xx 20 xx 21 xx 22 xx 23 44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // d5 = xx 30 xx 31 xx 32 xx 33 44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtrn.u16 d2, d3 \n" 44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 0+1+2, 3+4+5 44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vadd.u16 q0, q1 \n" 44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Need to divide, but can't downshift as the the value 44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // isn't a power of 2. So multiply by 65536 / n 45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // and take the upper 16 bits. 45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vqrdmulh.s16 q0, q0, q13 \n" 45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Align for table lookup, vtbl requires registers to 45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // be adjacent 45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmov.u8 d2, d4 \n" 45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtbl.u8 d3, {d0, d1, d2}, d28 \n" 45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vtbl.u8 d4, {d0, d1, d2}, d29 \n" 45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {d3}, [%1]! \n" 46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u32 {d4[0]}, [%1]! \n" 46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %2, #12 \n" 46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_ptr), // %0 46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_ptr), // %1 46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width), // %2 46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(src_stride) // %3 46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "r"(&kMult38_Div6), // %4 46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "r"(&kShuf38_2) // %5 47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" 47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 16x2 -> 16x1 47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ScaleFilterRows_NEON(uint8* dst_ptr, 47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_ptr, ptrdiff_t src_stride, 47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int dst_width, int source_y_fraction) { 47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "cmp %4, #0 \n" 48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "beq 2f \n" 48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "add %2, %1 \n" 48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "cmp %4, #128 \n" 48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "beq 3f \n" 48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vdup.8 d5, %4 \n" 48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "rsb %4, #256 \n" 48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vdup.8 d4, %4 \n" 48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q0}, [%1]! \n" 49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q1}, [%2]! \n" 49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %3, #16 \n" 49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmull.u8 q13, d0, d4 \n" 49333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmull.u8 q14, d1, d4 \n" 49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q13, d2, d5 \n" 49533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vmlal.u8 q14, d3, d5 \n" 49633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrshrn.u16 d0, q13, #8 \n" 49733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrshrn.u16 d1, q14, #8 \n" 49833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {q0}, [%0]! \n" 49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 1b \n" 50033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "b 4f \n" 50133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "2: \n" 50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q0}, [%1]! \n" 50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %3, #16 \n" 50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {q0}, [%0]! \n" 50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 2b \n" 50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "b 4f \n" 50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "3: \n" 51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q0}, [%1]! \n" 51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vld1.u8 {q1}, [%2]! \n" 51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "subs %3, #16 \n" 51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vrhadd.u8 q0, q1 \n" 51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {q0}, [%0]! \n" 51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "bgt 3b \n" 51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "4: \n" 51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "vst1.u8 {d1[7]}, [%0] \n" 51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(dst_ptr), // %0 51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(src_ptr), // %1 52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(src_stride), // %2 52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(dst_width), // %3 52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(source_y_fraction) // %4 52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : 52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" 52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // __ARM_NEON__ 52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus 53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} // extern "C" 53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} // namespace libyuv 53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 535