neon.h revision af51b94a435132e9014c324e25fb686b3d07a8c8
1ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// Copyright 2014 Google Inc. All Rights Reserved.
2ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu//
3ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// Use of this source code is governed by a BSD-style license
4ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// that can be found in the COPYING file in the root of the source
5ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// tree. An additional intellectual property rights grant can be found
6ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// in the file PATENTS. All contributing project authors may
7ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// be found in the AUTHORS file in the root of the source tree.
8ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// -----------------------------------------------------------------------------
9ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu//
10ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu//  NEON common code.
11ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu
12ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#ifndef WEBP_DSP_NEON_H_
13ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#define WEBP_DSP_NEON_H_
14ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu
1569355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis#include <arm_neon.h>
1655fc873017f10f6f566b182b70f6fc22aefa3464Chandler Carruth
17ec8605f1d7ec846dbf51047bfd5c56d32d1ff91cArgyrios Kyrtzidis#include "./dsp.h"
1869355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis
// Some of the intrinsics-based functions currently seem slower than their
// inline-assembly counterparts, so intrinsics are left disabled by default.
// aarch64 is the exception: the inline assembly is incompatible there.
#ifdef __aarch64__
#define USE_INTRINSICS   // use intrinsics when possible
#endif
24ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu
// Stores 'a' and 'b' into v.val[0] and v.val[1] respectively, where 'v' is
// any aggregate exposing a '.val[]' array (e.g. int32x4x2_t).
// All parameters are parenthesized so that expression arguments such as
// '*ptr' or 'cond ? x : y' expand correctly. 'v' is expanded once per lane
// and must therefore be free of side effects.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)
29651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines
// Stores 'a', 'b' and 'c' into v.val[0], v.val[1] and v.val[2] respectively,
// where 'v' is any aggregate exposing a '.val[]' array.
// All parameters are parenthesized so that expression arguments such as
// '*ptr' or 'cond ? x : y' expand correctly. 'v' is expanded once per lane
// and must therefore be free of side effects.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)
3569355798abdbe5e78d1185af7d4600b9355b5814Argyrios Kyrtzidis
// Stores 'a', 'b', 'c' and 'd' into v.val[0] .. v.val[3] respectively, where
// 'v' is any aggregate exposing a '.val[]' array (e.g. int32x4x4_t).
// All parameters are parenthesized so that expression arguments such as
// '*ptr' or 'cond ? x : y' expand correctly. 'v' is expanded once per lane
// and must therefore be free of side effects.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)
42ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu
435eca482fe895ea57bc82410222e6426c09e63284Ted Kremenek// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
44ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
45ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
46ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
47ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#define WORK_AROUND_GCC
486f516f50e53b621613d281ef186c76c5160d9d35Ted Kremenek#endif
49ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu
507a95de68c093991047ed8d339479ccad51b88663David Blaikiestatic WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
51b1dbe0ee0d2e766067ab5a30daf89b2743ebbe43Zhongxing Xu  uint64x2x2_t row01, row23;
52b1dbe0ee0d2e766067ab5a30daf89b2743ebbe43Zhongxing Xu
53b1dbe0ee0d2e766067ab5a30daf89b2743ebbe43Zhongxing Xu  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
54b1dbe0ee0d2e766067ab5a30daf89b2743ebbe43Zhongxing Xu  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
55b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
56b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
57ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu  // Transpose 64-bit values (there's no vswp equivalent)
58e884ff88baa1bd61db273baf107862a2110058edZhongxing Xu  {
593ed04d37573c566205d965d2e91d54ccae898d0aZhongxing Xu    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
60018220c343c103b7dfaa117a7a474c7a7fd6d068Zhongxing Xu    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
61ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
628bef8238181a30e52dea380789a7e2d760eac532Ted Kremenek    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
638bef8238181a30e52dea380789a7e2d760eac532Ted Kremenek    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
64ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
65d048c6ef5b6cfaa0cecb8cc1d4bdace32ed21d07Ted Kremenek    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
66ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
67ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu  }
68ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu  {
69ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
70b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu                                        vreinterpretq_s32_u64(row01.val[1]));
71b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
72ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu                                        vreinterpretq_s32_u64(row23.val[1]));
73651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    int32x4x4_t out;
74651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    out.val[0] = out01.val[0];
75651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    out.val[1] = out01.val[1];
76651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    out.val[2] = out23.val[0];
776f516f50e53b621613d281ef186c76c5160d9d35Ted Kremenek    out.val[3] = out23.val[1];
78b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu    return out;
79b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu  }
80b991f48ccff0567d581cf95e4eda1bffd5bbada3Zhongxing Xu}
816f516f50e53b621613d281ef186c76c5160d9d35Ted Kremenek
82ceeb02db9ad4232ea248a44192180d5bc7fe2653Zhongxing Xu#endif  // WEBP_DSP_NEON_H_
83e172e8b9e7fc67d7d03589af7e92fe777afcf33aAnna Zaks