1// Copyright 2014 Google Inc. All Rights Reserved. 2// 3// Use of this source code is governed by a BSD-style license 4// that can be found in the COPYING file in the root of the source 5// tree. An additional intellectual property rights grant can be found 6// in the file PATENTS. All contributing project authors may 7// be found in the AUTHORS file in the root of the source tree. 8// ----------------------------------------------------------------------------- 9// 10// NEON common code. 11 12#ifndef WEBP_DSP_NEON_H_ 13#define WEBP_DSP_NEON_H_ 14 15#include <arm_neon.h> 16 17#include "./dsp.h" 18 19// Right now, some intrinsics functions seem slower, so we disable them 20// everywhere except aarch64 where the inline assembly is incompatible. 21#if defined(__aarch64__) 22#define USE_INTRINSICS // use intrinsics when possible 23#endif 24 25#define INIT_VECTOR2(v, a, b) do { \ 26 v.val[0] = a; \ 27 v.val[1] = b; \ 28} while (0) 29 30#define INIT_VECTOR3(v, a, b, c) do { \ 31 v.val[0] = a; \ 32 v.val[1] = b; \ 33 v.val[2] = c; \ 34} while (0) 35 36#define INIT_VECTOR4(v, a, b, c, d) do { \ 37 v.val[0] = a; \ 38 v.val[1] = b; \ 39 v.val[2] = c; \ 40 v.val[3] = d; \ 41} while (0) 42 43// if using intrinsics, this flag avoids some functions that make gcc-4.6.3 44// crash ("internal compiler error: in immed_double_const, at emit-rtl."). 45// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) 46#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) 47#define WORK_AROUND_GCC 48#endif 49 50static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) { 51 uint64x2x2_t row01, row23; 52 53 row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); 54 row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); 55 row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); 56 row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); 57 // Transpose 64-bit values (there's no vswp equivalent) 58 { 59 const uint64x1_t row0h = vget_high_u64(row01.val[0]); 60 const uint64x1_t row2l = vget_low_u64(row23.val[0]); 61 const uint64x1_t row1h = vget_high_u64(row01.val[1]); 62 const uint64x1_t row3l = vget_low_u64(row23.val[1]); 63 row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); 64 row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); 65 row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); 66 row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); 67 } 68 { 69 const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), 70 vreinterpretq_s32_u64(row01.val[1])); 71 const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), 72 vreinterpretq_s32_u64(row23.val[1])); 73 int32x4x4_t out; 74 out.val[0] = out01.val[0]; 75 out.val[1] = out01.val[1]; 76 out.val[2] = out23.val[0]; 77 out.val[3] = out23.val[1]; 78 return out; 79 } 80} 81 82#endif // WEBP_DSP_NEON_H_ 83