1// Copyright 2014 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10//  NEON common code.
11
12#ifndef WEBP_DSP_NEON_H_
13#define WEBP_DSP_NEON_H_
14
15#include <arm_neon.h>
16
17#include "./dsp.h"
18
// For now, some intrinsics-based functions seem slower, so intrinsics are
// disabled everywhere except on aarch64, where the inline assembly is
// incompatible.
#ifdef __aarch64__
#define USE_INTRINSICS   // use intrinsics when possible
#endif
24
// Stores 'a' and 'b' into v.val[0] and v.val[1], in that order.
// 'v' may be any lvalue expression whose type has a 'val' array member; all
// macro parameters are parenthesized so that arguments such as '*ptr' expand
// with the intended precedence. Note that 'v' is evaluated once per store.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)
29
// Stores 'a', 'b' and 'c' into v.val[0], v.val[1] and v.val[2], in that order.
// 'v' may be any lvalue expression whose type has a 'val' array member; all
// macro parameters are parenthesized so that arguments such as '*ptr' expand
// with the intended precedence. Note that 'v' is evaluated once per store.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)
35
// Stores 'a', 'b', 'c' and 'd' into v.val[0] .. v.val[3], in that order.
// 'v' may be any lvalue expression whose type has a 'val' array member; all
// macro parameters are parenthesized so that arguments such as '*ptr' expand
// with the intended precedence. Note that 'v' is evaluated once per store.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)
42
43// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
44// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
45// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
46#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
47#define WORK_AROUND_GCC
48#endif
49
50static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
51  uint64x2x2_t row01, row23;
52
53  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
54  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
55  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
56  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
57  // Transpose 64-bit values (there's no vswp equivalent)
58  {
59    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
60    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
61    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
62    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
63    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
64    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
65    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
66    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
67  }
68  {
69    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
70                                        vreinterpretq_s32_u64(row01.val[1]));
71    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
72                                        vreinterpretq_s32_u64(row23.val[1]));
73    int32x4x4_t out;
74    out.val[0] = out01.val[0];
75    out.val[1] = out01.val[1];
76    out.val[2] = out23.val[0];
77    out.val[3] = out23.val[1];
78    return out;
79  }
80}
81
82#endif  // WEBP_DSP_NEON_H_
83