// neon.h revision af51b94a435132e9014c324e25fb686b3d07a8c8
1ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// Copyright 2014 Google Inc. All Rights Reserved. 2ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// 3ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// Use of this source code is governed by a BSD-style license 4ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// that can be found in the COPYING file in the root of the source 5ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// tree. An additional intellectual property rights grant can be found 6ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// in the file PATENTS. All contributing project authors may 7ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// be found in the AUTHORS file in the root of the source tree. 8ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// ----------------------------------------------------------------------------- 9ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// 10ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// NEON common code. 11ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu 12ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#ifndef WEBP_DSP_NEON_H_ 13ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#define WEBP_DSP_NEON_H_ 14ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu 1569355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis#include <arm_neon.h> 1655fc873017f10f6f566b182b70f6fc22aefa3464Chandler Carruth 17ec8605f1d7ec846dbf51047bfd5c56d32d1ff91cArgyrios Kyrtzidis#include "./dsp.h" 1869355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis 1969355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis// Right now, some intrinsics functions seem slower, so we disable them 209b663716449b618ba0390b1dbebc54fa8e971124Ted Kremenek// everywhere except aarch64 where the inline assembly is incompatible. 
21ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#if defined(__aarch64__) 22ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#define USE_INTRINSICS // use intrinsics when possible 239ef6537a894c33003359b1f9b9676e9178e028b7Ted Kremenek#endif 24ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu 25ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#define INIT_VECTOR2(v, a, b) do { \ 26ba5fb5a955c896815c439289fc51c03cf0635129Kovarththanan Rajaratnam v.val[0] = a; \ 27ec8605f1d7ec846dbf51047bfd5c56d32d1ff91cArgyrios Kyrtzidis v.val[1] = b; \ 28651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines} while (0) 29651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines 30ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#define INIT_VECTOR3(v, a, b, c) do { \ 3169355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis v.val[0] = a; \ 32ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu v.val[1] = b; \ 33ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu v.val[2] = c; \ 34ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu} while (0) 3569355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis 3669355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis#define INIT_VECTOR4(v, a, b, c, d) do { \ 378bef8238181a30e52dea380789a7e2d760eac532Ted Kremenek v.val[0] = a; \ 38ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu v.val[1] = b; \ 39ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu v.val[2] = c; \ 40ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu v.val[3] = d; \ 41ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu} while (0) 42ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu 435eca482fe895ea57bc82410222e6426c09e63284Ted Kremenek// if using intrinsics, this flag avoids some functions that make gcc-4.6.3 44ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// crash ("internal compiler error: in immed_double_const, at emit-rtl."). 
45ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) 46ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) 47ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#define WORK_AROUND_GCC 486f516f50e53b621613d281ef186c76c5160d9d35Ted Kremenek#endif 49ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu 507a95de68c093991047ed8d339479ccad51b88663David Blaikiestatic WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) { 51b1dbe0ee0d2e766067ab5a30daf89b2743ebbe43Zhongxing Xu uint64x2x2_t row01, row23; 52b1dbe0ee0d2e766067ab5a30daf89b2743ebbe43Zhongxing Xu 53b1dbe0ee0d2e766067ab5a30daf89b2743ebbe43Zhongxing Xu row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); 54b1dbe0ee0d2e766067ab5a30daf89b2743ebbe43Zhongxing Xu row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); 55b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); 56b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); 57ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu // Transpose 64-bit values (there's no vswp equivalent) 58e884ff88baa1bd61db273baf107862a2110058edZhongxing Xu { 593ed04d37573c566205d965d2e91d54ccae898d0aZhongxing Xu const uint64x1_t row0h = vget_high_u64(row01.val[0]); 60018220c343c103b7dfaa117a7a474c7a7fd6d068Zhongxing Xu const uint64x1_t row2l = vget_low_u64(row23.val[0]); 61ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu const uint64x1_t row1h = vget_high_u64(row01.val[1]); 628bef8238181a30e52dea380789a7e2d760eac532Ted Kremenek const uint64x1_t row3l = vget_low_u64(row23.val[1]); 638bef8238181a30e52dea380789a7e2d760eac532Ted Kremenek row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); 64ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); 65d048c6ef5b6cfaa0cecb8cc1d4bdace32ed21d07Ted 
Kremenek row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); 66ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); 67ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu } 68ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu { 69ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), 70b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu vreinterpretq_s32_u64(row01.val[1])); 71b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), 72ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu vreinterpretq_s32_u64(row23.val[1])); 73651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines int32x4x4_t out; 74651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines out.val[0] = out01.val[0]; 75651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines out.val[1] = out01.val[1]; 76651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines out.val[2] = out23.val[0]; 776f516f50e53b621613d281ef186c76c5160d9d35Ted Kremenek out.val[3] = out23.val[1]; 78b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu return out; 79b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu } 80b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu} 816f516f50e53b621613d281ef186c76c5160d9d35Ted Kremenek 82ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#endif // WEBP_DSP_NEON_H_ 83e172e8b9e7fc67d7d03589af7e92fe777afcf33aAnna Zaks