// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)
131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "./dsp.h"
151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#if defined(WEBP_USE_NEON)
171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
18af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include <assert.h>
19af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
20af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "./neon.h"
211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "../enc/vp8enci.h"
221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// that the transform output is added to the pixels read from *ref and the
// result is written to *dst. See the comments there for algorithmic
// explanations.
29af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Fixed-point multipliers of the inverse transform, sized to fit in int16_t
// so that they can be used with the 16-bit saturating doubling multiply
// (vqdmulh), which returns the high half of 2 * a * b:
//  - kC1 only holds the fractional part of its multiplier: the doubled
//    product is halved again and the operand itself is added back afterwards.
//  - kC2 is stored as half of the real multiplier (35468 / 2); the doubling
//    done by vqdmulh supplies the missing factor of two.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;
32af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// The intrinsics version below works but is *slower* than the inline-assembly
// version that follows it (with gcc-4.6), so it is disabled by default and
// only compiled when USE_INTRINSICS is defined.
// With gcc-4.8, it is slightly faster than the inline assembly.
37af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(USE_INTRINSICS)
38af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
39af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
40af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
41af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
42af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
43af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
44af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
45af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// to the corresponding rows of 'dst'.
46af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
47af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                            const int16x8_t dst01,
48af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                            const int16x8_t dst23) {
49af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Unsigned saturate to 8b.
50af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
51af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
52af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
53af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Store the results.
54af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
55af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
56af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
57af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
58af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
59af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
60af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
61af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                               const uint8_t* const ref, uint8_t* const dst) {
62af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint32x2_t dst01 = vdup_n_u32(0);
63af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint32x2_t dst23 = vdup_n_u32(0);
64af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
65af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // Load the source pixels.
66af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
67af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
68af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
69af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
70af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
71af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  {
72af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // Convert to 16b.
73af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
74af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
75af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
76af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // Descale with rounding.
77af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
78af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
79af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // Add the inverse transform.
80af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    SaturateAndStore4x4(dst, out01, out23);
81af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
82af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
83af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
84af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
85af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                     int16x8x2_t* const out) {
86af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
87af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
88af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
89af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                                  // b0 d0 b1 d1 b2 d2 ...
90af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
91af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
92af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
93af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
94af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // {rows} = in0 | in4
95af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  //          in8 | in12
96af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // B1 = in4 | in12
97af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t B1 =
98af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
99af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // C0 = kC1 * in4 | kC1 * in12
100af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // C1 = kC2 * in4 | kC2 * in12
101af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
102af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
103af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
104af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                vget_low_s16(rows->val[1]));   // in0 + in8
105af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
106af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                vget_low_s16(rows->val[1]));   // in0 - in8
107af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // c = kC2 * in4 - kC1 * in12
108af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // d = kC1 * in4 + kC2 * in12
109af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
110af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
111af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
112af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
113af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
114af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
115af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
116af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  Transpose8x2(E0, E1, rows);
117af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
118af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
119af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void ITransformOne(const uint8_t* ref,
120af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                          const int16_t* in, uint8_t* dst) {
121af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int16x8x2_t rows;
122af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
123af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  TransformPass(&rows);
124af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  TransformPass(&rows);
125af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  Add4x4(rows.val[0], rows.val[1], ref, dst);
126af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
127af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
128af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#else
129af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
1301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void ITransformOne(const uint8_t* ref,
1311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                          const int16_t* in, uint8_t* dst) {
1321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int kBPS = BPS;
133af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
1341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  __asm__ volatile (
1361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16         {q1, q2}, [%[in]]           \n"
1371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16         {d0}, [%[kC1C2]]            \n"
1381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2: in[0]
1401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3: in[8]
1411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4: in[4]
1421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5: in[12]
1431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d4                      \n"
1441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
1461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q9 = {in[4], in[12]} * kC2 >> 16
1471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d22 = a = in[0] + in[8]
1511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d23 = b = in[0] - in[8]
1521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
1531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
1541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    //  q8 = in[4]/[12] * kC1 >> 16
1561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
1571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Add {in[4], in[12]} back after the multiplication.
1591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
1601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d20 = c = in[4]*kC2 - in[12]*kC1
1621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d21 = d = in[4]*kC1 + in[12]*kC2
1631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
1641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
1651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2 = tmp[0] = a + d
1671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3 = tmp[1] = b + c
1681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4 = tmp[2] = b - c
1691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5 = tmp[3] = a - d
1701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
1711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
1721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
1731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
1741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
1761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
1771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d4                      \n"
1791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
1821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d22 = a = tmp[0] + tmp[8]
1861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d23 = b = tmp[0] - tmp[8]
1871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
1881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
1891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
1911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
1921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d20 = c = in[4]*kC2 - in[12]*kC1
1941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d21 = d = in[4]*kC1 + in[12]*kC2
1951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
1961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
1971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2 = tmp[0] = a + d
1991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3 = tmp[1] = b + c
2001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4 = tmp[2] = b - c
2011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5 = tmp[3] = a - d
2021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
2031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
2041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
2051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
2061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
2081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
2091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
2101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
2111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
2131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // (val) + 4 >> 3
2151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d2, d2, #3                  \n"
2161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d3, d3, #3                  \n"
2171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d4, d4, #3                  \n"
2181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d5, d5, #3                  \n"
2191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
2211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
2221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Must accumulate before saturating
2241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u8        q8, d6                      \n"
2251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u8        q9, d7                      \n"
2261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q1, q1, q8                  \n"
2281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q2, q2, q9                  \n"
2291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqmovun.s16     d0, q1                      \n"
2311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqmovun.s16     d1, q2                      \n"
2321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
2341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
2351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
2361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d1[1], [%[dst]]             \n"
2371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
2391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
2401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
2411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  );
2421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
2431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
244af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif    // USE_INTRINSICS
245af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
// 'do_two' is non-zero. The second block sits 4 pixels to the right in 'ref'
// and 'dst' and uses the next 16 coefficients of 'in'.
static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int i;
  for (i = 0; i < num_blocks; ++i) {
    ITransformOne(ref + 4 * i, in + 16 * i, dst + 4 * i);
  }
}
2531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
254af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Load all 4x4 pixels into a single uint8x16_t variable.
255af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic uint8x16_t Load4x4(const uint8_t* src) {
256af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint32x4_t out = { 0, 0, 0, 0 };
257af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
258af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
259af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
260af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
261af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return vreinterpretq_u8_u32(out);
2621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
2631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
// Forward transform.
2651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
266af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(USE_INTRINSICS)
267af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
268af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
269af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                         const int16x4_t C, const int16x4_t D,
270af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                         int16x8_t* const out01,
271af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                         int16x8_t* const out32) {
272af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x4x2_t AB = vtrn_s16(A, B);
273af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x4x2_t CD = vtrn_s16(C, D);
274af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
275af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                     vreinterpret_s32_s16(CD.val[0]));
276af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
277af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                     vreinterpret_s32_s16(CD.val[1]));
278af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  *out01 = vreinterpretq_s16_s64(
279af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
280af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                   vreinterpret_s64_s32(tmp13.val[0])));
281af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  *out32 = vreinterpretq_s16_s64(
282af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
283af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                   vreinterpret_s64_s32(tmp02.val[1])));
284af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
285af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
286af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
287af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                         const uint8x8_t b) {
288af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return vreinterpretq_s16_u16(vsubl_u8(a, b));
289af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
290af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
291af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref,
292af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                       int16_t* out) {
293af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
294af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  {
295af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint8x16_t S0 = Load4x4(src);
296af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint8x16_t R0 = Load4x4(ref);
297af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
298af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
299af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t D0 = vget_low_s16(D0D1);
300af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t D1 = vget_high_s16(D0D1);
301af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t D2 = vget_low_s16(D2D3);
302af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t D3 = vget_high_s16(D2D3);
303af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
304af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
305af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  {    // 1rst pass
306af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t kCst937 = vdupq_n_s32(937);
307af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t kCst1812 = vdupq_n_s32(1812);
308af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
309af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
310af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
311af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
312af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                    vget_high_s16(a0a1_2));
313af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
314af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                    vget_high_s16(a0a1_2));
315af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
316af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
317af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
318af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
319af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
320af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
321af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
322af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
323af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  {    // 2nd pass
324af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
325af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
326af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t kCst51000 = vdupq_n_s32(51000);
327af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
328af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
329af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
330af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
331af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
332af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
333af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
334af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
335af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
336af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
337af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
338af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t a3_eq_0 =
339af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
340af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
341af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    vst1_s16(out +  0, out0);
342af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    vst1_s16(out +  4, out1);
343af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    vst1_s16(out +  8, out2);
344af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    vst1_s16(out + 12, out3);
345af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
346af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
347af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
348af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#else
349af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
3501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
// Constant tables read by the inline-asm FTransform(). Every constant is
// replicated four times so that it fills one 64-bit (16-bit lanes) or one
// 128-bit (32-bit lanes) NEON register.
#define REPEAT4(V) (V), (V), (V), (V)
// 16-bit multipliers: lanes 0..3 hold 5352, lanes 4..7 hold 2217.
static const int16_t kCoeff16[] = {
  REPEAT4(5352), REPEAT4(2217)
};
// 32-bit addends: 1812 and 937 seed the accumulators that are shifted right
// by 9 in the first pass, 12000 and 51000 seed the ones shifted right by 16
// in the second pass.
static const int32_t kCoeff32[] = {
  REPEAT4(1812),
  REPEAT4(937),
  REPEAT4(12000),
  REPEAT4(51000)
};
#undef REPEAT4
3601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref,
3621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                       int16_t* out) {
3631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int kBPS = BPS;
3641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* src_ptr = src;
3651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* ref_ptr = ref;
3661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int16_t* coeff16 = kCoeff16;
3671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int32_t* coeff32 = kCoeff32;
3681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  __asm__ volatile (
3701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load src into q4, q5 in high half
3711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
3721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
3731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
3741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d11}, [%[src_ptr]]               \n"
3751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load ref into q6, q7 in high half
3771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
3781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
3791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
3801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d15}, [%[ref_ptr]]               \n"
3811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Pack the high values in to q4 and q6
3831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32     q4, q5                       \n"
3841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32     q6, q7                       \n"
3851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d[0-3] = src - ref
3871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8    q0, d8, d12                  \n"
3881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8    q1, d9, d13                  \n"
3891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load coeff16 into q8(d16=5352, d17=2217)
3911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16     {q8}, [%[coeff16]]           \n"
3921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load coeff32 high half into q9 = 1812, q10 = 937
3941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
3951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load coeff32 low half into q11=12000, q12=51000
3971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
3981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // part 1
4001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Transpose. Register dN is the same as dN in C
4011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d0, d2                   \n"
4021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d1, d3                   \n"
4031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d0, d1                   \n"
4041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d2, d3                   \n"
4051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
4071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
4081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
4091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
4101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d0, d4, d5               \n" // a0 + a1
4121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
4131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d2, d4, d5               \n" // a0 - a1
4141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
4151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
4171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
4181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
4191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
4201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
4221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
4231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d1, q9, #9               \n"
4241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d3, q10, #9              \n"
4251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // part 2
4271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
4281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d0, d2                   \n"
4291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d1, d3                   \n"
4301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d0, d1                   \n"
4311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d2, d3                   \n"
4321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmov.s16        d26, #7                  \n"
4341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
4361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
4371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
4381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d4, d26              \n" // a1 + 7
4391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
4401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
4421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
4431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
4451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
4461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vceq.s16        d4, d7, #0               \n"
4481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        d0, d0, #4               \n"
4501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        d2, d2, #4               \n"
4511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
4531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
4541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4550406ce1417f76f2034833414dcecc9f56253640cVikas Arora    "vmvn            d4, d4                   \n" // !(d1 == 0)
4561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
4571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d1, q11, #16             \n"
4581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[4] += (d1!=0)
4591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d1, d1, d4               \n"
4601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
4611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d3, q12, #16             \n"
4621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // set result to out array
4641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.16         {q0, q1}, [%[out]]   \n"
4651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
4661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      [coeff32] "+r"(coeff32)          // modified registers
4671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
4681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      [out] "r"(out)                   // constants
4691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
4701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      "q10", "q11", "q12", "q13"       // clobbered
4711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  );
4721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
4731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
474af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif
475af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
476af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#define LOAD_LANE_16b(VALUE, LANE) do {             \
477af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
478af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  src += stride;                                    \
479af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} while (0)
480af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
481af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void FTransformWHT(const int16_t* src, int16_t* out) {
482af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int stride = 16;
483af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x4_t zero = vdup_n_s16(0);
484af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int32x4x4_t tmp0;
485af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int16x4x4_t in;
486af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  INIT_VECTOR4(in, zero, zero, zero, zero);
487af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[0], 0);
488af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[1], 0);
489af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[2], 0);
490af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[3], 0);
491af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[0], 1);
492af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[1], 1);
493af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[2], 1);
494af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[3], 1);
495af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[0], 2);
496af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[1], 2);
497af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[2], 2);
498af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[3], 2);
499af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[0], 3);
500af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[1], 3);
501af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[2], 3);
502af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_16b(in.val[3], 3);
503af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
504af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  {
505af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a0 = in[0 * 16] + in[2 * 16]
506af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a1 = in[1 * 16] + in[3 * 16]
507af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a2 = in[1 * 16] - in[3 * 16]
508af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a3 = in[0 * 16] - in[2 * 16]
509af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
510af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
511af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
512af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
513af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    tmp0.val[0] = vaddq_s32(a0, a1);
514af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    tmp0.val[1] = vaddq_s32(a3, a2);
515af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    tmp0.val[2] = vsubq_s32(a3, a2);
516af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    tmp0.val[3] = vsubq_s32(a0, a1);
517af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
518af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  {
519af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
520af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a0 = tmp[0 + i] + tmp[ 8 + i]
521af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a1 = tmp[4 + i] + tmp[12 + i]
522af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a2 = tmp[4 + i] - tmp[12 + i]
523af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a3 = tmp[0 + i] - tmp[ 8 + i]
524af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
525af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
526af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
527af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
528af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
529af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
530af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
531af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
532af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t out0 = vmovn_s32(b0);
533af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t out1 = vmovn_s32(b1);
534af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t out2 = vmovn_s32(b2);
535af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int16x4_t out3 = vmovn_s32(b3);
536af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
537af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    vst1_s16(out +  0, out0);
538af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    vst1_s16(out +  4, out1);
539af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    vst1_s16(out +  8, out2);
540af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    vst1_s16(out + 12, out3);
541af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
5421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
543af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef LOAD_LANE_16b
5441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
5451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
5461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Texture distortion
5471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
5481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// We try to match the spectral content (weighted) between source and
5491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// reconstructed samples.
5501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now: it is only compiled when
// USE_INTRINSICS is defined.
// With gcc-4.8, it's only slightly slower than the inlined-asm version.
555af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(USE_INTRINSICS)
556af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
557af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Zero extend an uint16x4_t 'v' to an int32x4_t.
558af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
559af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return vreinterpretq_s32_u32(vmovl_u16(v));
560af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
561af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
562af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Does a regular 4x4 transpose followed by an adjustment of the upper columns
563af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// in the inner rows to restore the source order of differences,
564af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// i.e., a0 - a1 | a3 - a2.
565af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
566af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int32x4x4_t out = Transpose4x4(rows);
567af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // restore source order in the columns containing differences.
568af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x2_t r1h = vget_high_s32(out.val[1]);
569af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x2_t r2h = vget_high_s32(out.val[2]);
570af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h);
571af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h);
572af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return out;
573af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
574af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
575af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
576af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                                   const uint8x8_t r2r3) {
577af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a0 = in[0] + in[2] | a1 = in[1] + in[3]
578af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
579af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a3 = in[0] - in[2] | a2 = in[1] - in[3]
580af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
581af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1));  // a0 + a1
582af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2));  // a3 + a2
583af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
584af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
585af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
586af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8x2_t transpose =
587af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
588af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
589af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
590af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                      vget_low_s16(transpose.val[1]));
591af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
592af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                      vget_high_s16(transpose.val[1]));
593af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // [0]: tmp[3] [1]: tmp[2]
594af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
595af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
596af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return res;
597af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
598af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
599af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
600af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a0 = tmp[0 + i] + tmp[8 + i];
601af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
602af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a1 = tmp[4 + i] + tmp[12+ i];
603af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
604af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a2 = tmp[4 + i] - tmp[12+ i];
605af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
606af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  // a3 = tmp[0 + i] - tmp[8 + i];
607af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
608af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1));  // abs(a0 + a1)
609af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2));  // abs(a3 + a2)
610af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t b2 = vabdq_s32(a3, a2);              // abs(a3 - a2)
611af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t b3 = vabdq_s32(a0, a1);              // abs(a0 - a1)
612af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4x4_t res = { { b0, b1, b2, b3 } };
613af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return res;
614af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
615af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
616af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Calculate the weighted sum of the rows in 'b'.
617af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
618af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                      const int32x4_t w0, const int32x4_t w1,
619af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                      const int32x4_t w2, const int32x4_t w3) {
620af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t s0 = vmulq_s32(w0, b.val[0]);
621af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]);
622af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]);
623af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]);
624af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int64x2_t sum1 = vpaddlq_s32(s3);
625af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1));
626af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return sum2;
627af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
628af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
629af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#define LOAD_LANE_32b(src, VALUE, LANE) \
630af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
631af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
632af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Hadamard transform
633af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients.
634af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b,
635af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                    const uint16_t* const w) {
636af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint32x4_t d0d1 = { 0, 0, 0, 0 };
637af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint32x4_t d2d3 = { 0, 0, 0, 0 };
638af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_32b(a + 0 * BPS, d0d1, 0);  // a00 a01 a02 a03
639af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_32b(a + 1 * BPS, d0d1, 1);  // a10 a11 a12 a13
640af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_32b(b + 0 * BPS, d0d1, 2);  // b00 b01 b02 b03
641af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_32b(b + 1 * BPS, d0d1, 3);  // b10 b11 b12 b13
642af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_32b(a + 2 * BPS, d2d3, 0);  // a20 a21 a22 a23
643af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_32b(a + 3 * BPS, d2d3, 1);  // a30 a31 a32 a33
644af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_32b(b + 2 * BPS, d2d3, 2);  // b20 b21 b22 b23
645af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  LOAD_LANE_32b(b + 3 * BPS, d2d3, 3);  // b30 b31 b32 b33
646af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
647af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  {
648af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
649af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
650af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint16x8x2_t tmp =
651af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora        vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
652af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
653af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
654af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
655af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                                    vget_low_u8(d2d3u8));
656af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
657af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                                    vget_high_u8(d2d3u8));
658af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
659af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
660af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
661af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
662af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
663af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
664af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
665af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
666af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
667af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
668af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
669af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                    vreinterpret_s32_s64(sum2));
670af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const int32x2_t res = vshr_n_s32(diff, 5);
671af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    return vget_lane_s32(res, 0);
672af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
673af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
674af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
675af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef LOAD_LANE_32b
676af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
677af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#else
678af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
6791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Hadamard transform
6801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients.
6811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b,
6821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                    const uint16_t* const w) {
6831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int kBPS = BPS;
6841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* A = a;
6851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* B = b;
6861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint16_t* W = w;
6871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  int sum;
6881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  __asm__ volatile (
6891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
6901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
6911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
6921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d2[1], [%[a]]            \n"
6931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
6941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
6951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
6961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
6971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d3[1], [%[b]]            \n"
6981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
6991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a d0/d2, b d1/d3
7001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d0/d1: 01 01 01 01
7011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2/d3: 23 23 23 23
7021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // But: it goes 01 45 23 67
7031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Notice the middle values are transposed
7041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         q0, q1                   \n"
7051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
7071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vaddl.u8        q2, d0, d2               \n"
7081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vaddl.u8        q10, d1, d3              \n"
7091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
7101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8        q3, d0, d2               \n"
7111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8        q11, d1, d3              \n"
7121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // tmp[0] = a0 + a1
7141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.s16      q0, q2                   \n"
7151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.s16      q8, q10                  \n"
7161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // tmp[1] = a3 + a2
7181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.s16      q1, q3                   \n"
7191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.s16      q9, q11                  \n"
7201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // No pair subtract
7221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q2 = {a0, a3}
7231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q3 = {a1, a2}
7241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         q2, q3                   \n"
7251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         q10, q11                 \n"
7261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
7281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.s16       q12, d4, d6              \n"
7291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.s16       q13, d5, d7              \n"
7301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.s16       q14, d20, d22            \n"
7311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.s16       q15, d21, d23            \n"
7321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // separate tmp[3] and tmp[2]
7341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q12 = tmp[3]
7351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q13 = tmp[2]
7361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q12, q13                 \n"
7371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q14, q15                 \n"
7381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Transpose tmp for a
7401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d1, d26                  \n" // vtrn.64
7411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d24                  \n" // vtrn.64
7421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q0, q1                   \n"
7431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q13, q12                 \n"
7441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Transpose tmp for b
7461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d17, d30                 \n" // vtrn.64
7471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d19, d28                 \n" // vtrn.64
7481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q8, q9                   \n"
7491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q15, q14                 \n"
7501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // The first Q register is a, the second b.
7521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q0/8 tmp[0-3]
7531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q13/15 tmp[4-7]
7541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q1/9 tmp[8-11]
7551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q12/14 tmp[12-15]
7561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // These are still in 01 45 23 67 order. We fix it easily in the addition
7588b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    // case but the subtraction propagates them.
7591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d27                  \n"
7601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d19, d31                 \n"
7611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a0 = tmp[0] + tmp[8]
7631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q2, q0, q1               \n"
7641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q3, q8, q9               \n"
7651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a1 = tmp[4] + tmp[12]
7671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q10, q13, q12            \n"
7681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q11, q15, q14            \n"
7691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a2 = tmp[4] - tmp[12]
7711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q13, q13, q12            \n"
7721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q15, q15, q14            \n"
7731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a3 = tmp[0] - tmp[8]
7751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q0, q0, q1               \n"
7761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q8, q8, q9               \n"
7771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // b0 = a0 + a1
7791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q1, q2, q10              \n"
7801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q9, q3, q11              \n"
7811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // b1 = a3 + a2
7831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q12, q0, q13             \n"
7841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q14, q8, q15             \n"
7851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // b2 = a3 - a2
7871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q0, q0, q13              \n"
7881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q8, q8, q15              \n"
7891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // b3 = a0 - a1
7911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q2, q2, q10              \n"
7921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q3, q3, q11              \n"
7931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.64         {q10, q11}, [%[w]]       \n"
7951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(b0)
7971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q1, q1                   \n"
7981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q9, q9                   \n"
7991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(b1)
8001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q12, q12                 \n"
8011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q14, q14                 \n"
8021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(b2)
8031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q0, q0                   \n"
8041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q8, q8                   \n"
8051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(b3)
8061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q2, q2                   \n"
8071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q3, q3                   \n"
8081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // expand w before using.
8101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u16       q13, d20                 \n"
8111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u16       q15, d21                 \n"
8121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // w[0] * abs(b0)
8141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmul.u32        q1, q1, q13              \n"
8151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmul.u32        q9, q9, q13              \n"
8161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // w[4] * abs(b1)
8181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q1, q12, q15             \n"
8191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q9, q14, q15             \n"
8201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // expand w before using.
8221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u16       q13, d22                 \n"
8231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u16       q15, d23                 \n"
8241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // w[8] * abs(b1)
8261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q1, q0, q13              \n"
8271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q9, q8, q13              \n"
8281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // w[12] * abs(b1)
8301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q1, q2, q15              \n"
8311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q9, q3, q15              \n"
8321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Sum the arrays
8341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.u32      q1, q1                   \n"
8351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.u32      q9, q9                   \n"
8361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.u64        d2, d3                   \n"
8371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.u64        d18, d19                 \n"
8381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Hadamard transform needs 4 bits of extra precision (2 bits in each
8401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
8411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // precision for coeff is 8bit of input + 4bits of Hadamard transform +
8421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // 16bits for w[] + 2 bits of abs() summation.
8431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    //
8441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
8451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // A-OK.
8461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // sum2 - sum1
8481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.u32        d0, d2, d18              \n"
8491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(sum2 - sum1)
8501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        d0, d0                   \n"
8511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(sum2 - sum1) >> 5
8521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.u32        d0, #5                   \n"
8531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // It would be better to move the value straight into r0 but I'm not
8551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // entirely sure how this works with inline assembly.
8561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmov.32         %[sum], d0[0]            \n"
8571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
8591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [kBPS] "r"(kBPS)
8601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
8611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
8621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  ) ;
8631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  return sum;
8651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
8661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
867af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif  // USE_INTRINSICS
868af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
8691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto16x16(const uint8_t* const a, const uint8_t* const b,
8701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                      const uint16_t* const w) {
8711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  int D = 0;
8721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  int x, y;
8731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
8741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    for (x = 0; x < 16; x += 4) {
8751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      D += Disto4x4(a + x + y, b + x + y, w);
8761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    }
8771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  }
8781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  return D;
8791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
8801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
881af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//------------------------------------------------------------------------------
882af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
883af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
884af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                             int start_block, int end_block,
885af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                             VP8Histogram* const histo) {
886af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
887af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int j;
888af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (j = start_block; j < end_block; ++j) {
889af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    int16_t out[16];
890af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
891af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    {
892af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      int k;
893af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const int16x8_t a0 = vld1q_s16(out + 0);
894af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const int16x8_t b0 = vld1q_s16(out + 8);
895af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
896af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
897af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
898af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
899af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
900af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
901af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
902af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
903af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      // Convert coefficients to bin.
904af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      for (k = 0; k < 16; ++k) {
905af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora        histo->distribution[out[k]]++;
906af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora      }
907af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    }
908af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
909af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
910af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
911af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//------------------------------------------------------------------------------
912af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
913af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
914af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                        const uint8_t* const b,
915af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                        uint32x4_t* const sum) {
916af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint8x16_t a0 = vld1q_u8(a);
917af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint8x16_t b0 = vld1q_u8(b);
918af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
919af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
920af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
921af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  *sum = vpadalq_u16(*sum, prod);      // pair-wise add and accumulate
922af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
923af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
924af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Horizontal sum of all four uint32_t values in 'sum'.
925af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SumToInt(uint32x4_t sum) {
926af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint64x2_t sum2 = vpaddlq_u32(sum);
927af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
928af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return (int)sum3;
929af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
930af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
931af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE16x16(const uint8_t* a, const uint8_t* b) {
932af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint32x4_t sum = { 0, 0, 0, 0 };
933af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int y;
934af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (y = 0; y < 16; ++y) {
935af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
936af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
937af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return SumToInt(sum);
938af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
939af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
940af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE16x8(const uint8_t* a, const uint8_t* b) {
941af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint32x4_t sum = { 0, 0, 0, 0 };
942af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int y;
943af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (y = 0; y < 8; ++y) {
944af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
945af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
946af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return SumToInt(sum);
947af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
948af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
949af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE8x8(const uint8_t* a, const uint8_t* b) {
950af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint32x4_t sum = { 0, 0, 0, 0 };
951af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  int y;
952af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  for (y = 0; y < 8; ++y) {
953af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint8x8_t a0 = vld1_u8(a + y * BPS);
954af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint8x8_t b0 = vld1_u8(b + y * BPS);
955af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint8x8_t abs_diff = vabd_u8(a0, b0);
956af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
957af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora    sum = vpadalq_u16(sum, prod);
958af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  }
959af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return SumToInt(sum);
960af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
961af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
962af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE4x4(const uint8_t* a, const uint8_t* b) {
963af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint8x16_t a0 = Load4x4(a);
964af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint8x16_t b0 = Load4x4(b);
965af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
966af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
967af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
968af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return SumToInt(vpaddlq_u16(prod));
969af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
970af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
971af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//------------------------------------------------------------------------------
972af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
973af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable
974af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// in iOS/arm64 builds. Disable this function in those cases.
975af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
976af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
977af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int16x8_t Quantize(int16_t* const in,
978af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                          const VP8Matrix* const mtx, int offset) {
979af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
980af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
981af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
982af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
983af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
984af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
985af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t a = vld1q_s16(in + offset);                // in
986af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
987af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
988af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
989af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
990af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
991af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
992af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
993af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
994af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
995af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
996af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
997af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
998af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
999af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  vst1q_s16(in + offset, c4);
1000af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
1001af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora  return c3;
1002af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}
1003af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Byte indices for the four table lookups of the zigzag reordering. Output
// position n takes the 16-bit coefficient number listed at position n below,
// i.e. source bytes 2 * c and 2 * c + 1 of the 32-byte lookup table.
#define COEFF_BYTES(c) (2 * (c)), (2 * (c) + 1)
static const uint8_t kShuffles[4][8] = {
  { COEFF_BYTES(0), COEFF_BYTES(1),  COEFF_BYTES(4),  COEFF_BYTES(8)  },
  { COEFF_BYTES(5), COEFF_BYTES(2),  COEFF_BYTES(3),  COEFF_BYTES(6)  },
  { COEFF_BYTES(9), COEFF_BYTES(12), COEFF_BYTES(13), COEFF_BYTES(10) },
  { COEFF_BYTES(7), COEFF_BYTES(11), COEFF_BYTES(14), COEFF_BYTES(15) }
};
#undef COEFF_BYTES
1010af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
// Quantizes the block 'in' with the matrix 'mtx' (two Quantize() calls, at
// offsets 0 and 8) and stores the 16 resulting coefficients into 'out' in
// zigzag order, as selected by the kShuffles byte tables.
// Returns 1 if at least one stored coefficient is non-zero, 0 otherwise.
static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize(in, mtx, 0);
  const int16x8_t out1 = Quantize(in, mtx, 8);
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  {
    // Zigzag reordering: each table lookup gathers the 8 bytes (4 coeffs)
    // of one output row from the 32 bytes held in 'all_out'.
    const uint8x8_t zz0 = vtbl4_u8(all_out, vld1_u8(kShuffles[0]));
    const uint8x8_t zz1 = vtbl4_u8(all_out, vld1_u8(kShuffles[1]));
    const uint8x8_t zz2 = vtbl4_u8(all_out, vld1_u8(kShuffles[2]));
    const uint8x8_t zz3 = vtbl4_u8(all_out, vld1_u8(kShuffles[3]));
    // Test for zeros on the registers rather than re-reading 'out' through a
    // uint64_t*: such a read breaks strict-aliasing rules and assumes an
    // 8-byte alignment that an int16_t array does not guarantee.
    const uint8x8_t any_bits = vorr_u8(vorr_u8(zz0, zz1), vorr_u8(zz2, zz3));
    vst1_u8((uint8_t*)(out +  0), zz0);
    vst1_u8((uint8_t*)(out +  4), zz1);
    vst1_u8((uint8_t*)(out +  8), zz2);
    vst1_u8((uint8_t*)(out + 12), zz3);
    return (vget_lane_u64(vreinterpret_u64_u8(any_bits), 0) != 0);
  }
}
1033af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
1034af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif   // !WORK_AROUND_GCC && !__aarch64__
1035af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora
10361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#endif   // WEBP_USE_NEON
10371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
//------------------------------------------------------------------------------
// Entry point
10401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
extern void VP8EncDspInitNEON(void);

// Points the encoder's DSP function hooks at the NEON implementations from
// this file. Compiles to an empty function when WEBP_USE_NEON is not defined.
void VP8EncDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  // Transforms.
  VP8FTransform = FTransform;
  VP8ITransform = ITransform;
  VP8FTransformWHT = FTransformWHT;

  // Histogram collection.
  VP8CollectHistogram = CollectHistogram;

  // Disto* hooks.
  VP8TDisto16x16 = Disto16x16;
  VP8TDisto4x4 = Disto4x4;

  // SSE* hooks.
  VP8SSE4x4 = SSE4x4;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE16x16 = SSE16x16;

  // QuantizeBlock is only installed when neither WORK_AROUND_GCC nor
  // __aarch64__ is defined.
#if !defined(WORK_AROUND_GCC) && !defined(__aarch64__)
  VP8EncQuantizeBlock = QuantizeBlock;
#endif
#endif   // WEBP_USE_NEON
}
1062