11e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Copyright 2012 Google Inc. All Rights Reserved.
21e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
30406ce1417f76f2034833414dcecc9f56253640cVikas Arora// Use of this source code is governed by a BSD-style license
40406ce1417f76f2034833414dcecc9f56253640cVikas Arora// that can be found in the COPYING file in the root of the source
50406ce1417f76f2034833414dcecc9f56253640cVikas Arora// tree. An additional intellectual property rights grant can be found
60406ce1417f76f2034833414dcecc9f56253640cVikas Arora// in the file PATENTS. All contributing project authors may
70406ce1417f76f2034833414dcecc9f56253640cVikas Arora// be found in the AUTHORS file in the root of the source tree.
81e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// -----------------------------------------------------------------------------
91e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// ARM NEON version of speed-critical encoding functions.
111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// adapted from libvpx (http://www.webmproject.org/code/)
131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "./dsp.h"
151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#if defined(WEBP_USE_NEON)
171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include <assert.h>
1933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
2033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./neon.h"
211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "../enc/vp8enci.h"
221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Transforms (Paragraph 14.4)
251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// that the result is added to the pixels read from *ref and stored to *dst.
// See the comments there for algorithmic explanations.
2933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Fixed-point multipliers used by the inverse transform (16b fraction).
// kC1 only holds the fractional part of its multiplier: the code multiplies
// by kC1 and then adds the input back.
static const int16_t kC1 = 20091;
// kC2 is stored halved (2 * 17734 = 35468) because it is applied with a
// doubling multiply (vqdmulh), which restores the factor of two.
static const int16_t kC2 = 17734;
3233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6), so it is only compiled when USE_INTRINSICS is defined.
// With gcc-4.8, it is slightly faster than the inlined assembly.
3733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(USE_INTRINSICS)
3833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
3933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
4033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
4133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
4233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
4333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
4433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
4533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// to the corresponding rows of 'dst'.
4633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
4733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                            const int16x8_t dst01,
4833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                            const int16x8_t dst23) {
4933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Unsigned saturate to 8b.
5033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
5133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
5233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
5333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Store the results.
5433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
5533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
5633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
5733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
5833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
5933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
6033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
6133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                               const uint8_t* const ref, uint8_t* const dst) {
6233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x2_t dst01 = vdup_n_u32(0);
6333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x2_t dst23 = vdup_n_u32(0);
6433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
6533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Load the source pixels.
6633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
6733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
6833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
6933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
7033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
7133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
7233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Convert to 16b.
7333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
7433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
7533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
7633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Descale with rounding.
7733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
7833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
7933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Add the inverse transform.
8033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    SaturateAndStore4x4(dst, out01, out23);
8133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
8233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
8333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
8533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     int16x8x2_t* const out) {
8633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
8733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
8833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
8933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                                  // b0 d0 b1 d1 b2 d2 ...
9033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
9133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
9233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
9333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
9433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // {rows} = in0 | in4
9533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  //          in8 | in12
9633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // B1 = in4 | in12
9733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t B1 =
9833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
9933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // C0 = kC1 * in4 | kC1 * in12
10033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // C1 = kC2 * in4 | kC2 * in12
10133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
10233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
10333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
10433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                vget_low_s16(rows->val[1]));   // in0 + in8
10533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
10633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                vget_low_s16(rows->val[1]));   // in0 - in8
10733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // c = kC2 * in4 - kC1 * in12
10833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // d = kC1 * in4 + kC2 * in12
10933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
11033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
11133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
11233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
11333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
11433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
11533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
11633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Transpose8x2(E0, E1, rows);
11733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
11833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
11933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ITransformOne(const uint8_t* ref,
12033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                          const int16_t* in, uint8_t* dst) {
12133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int16x8x2_t rows;
12233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
12333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  TransformPass(&rows);
12433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  TransformPass(&rows);
12533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Add4x4(rows.val[0], rows.val[1], ref, dst);
12633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
12733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
12833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else
12933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
1301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void ITransformOne(const uint8_t* ref,
1311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                          const int16_t* in, uint8_t* dst) {
1321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int kBPS = BPS;
13333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
1341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  __asm__ volatile (
1361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16         {q1, q2}, [%[in]]           \n"
1371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16         {d0}, [%[kC1C2]]            \n"
1381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2: in[0]
1401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3: in[8]
1411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4: in[4]
1421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5: in[12]
1431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d4                      \n"
1441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
1461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q9 = {in[4], in[12]} * kC2 >> 16
1471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d22 = a = in[0] + in[8]
1511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d23 = b = in[0] - in[8]
1521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
1531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
1541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    //  q8 = in[4]/[12] * kC1 >> 16
1561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
1571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Add {in[4], in[12]} back after the multiplication.
1591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
1601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d20 = c = in[4]*kC2 - in[12]*kC1
1621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d21 = d = in[4]*kC1 + in[12]*kC2
1631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
1641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
1651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2 = tmp[0] = a + d
1671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3 = tmp[1] = b + c
1681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4 = tmp[2] = b - c
1691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5 = tmp[3] = a - d
1701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
1711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
1721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
1731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
1741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
1761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
1771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d4                      \n"
1791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
1821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d22 = a = tmp[0] + tmp[8]
1861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d23 = b = tmp[0] - tmp[8]
1871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
1881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
1891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
1911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
1921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d20 = c = in[4]*kC2 - in[12]*kC1
1941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d21 = d = in[4]*kC1 + in[12]*kC2
1951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
1961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
1971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2 = tmp[0] = a + d
1991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3 = tmp[1] = b + c
2001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4 = tmp[2] = b - c
2011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5 = tmp[3] = a - d
2021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
2031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
2041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
2051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
2061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
2081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
2091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
2101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
2111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
2131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // (val) + 4 >> 3
2151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d2, d2, #3                  \n"
2161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d3, d3, #3                  \n"
2171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d4, d4, #3                  \n"
2181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d5, d5, #3                  \n"
2191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
2211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
2221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Must accumulate before saturating
2241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u8        q8, d6                      \n"
2251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u8        q9, d7                      \n"
2261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q1, q1, q8                  \n"
2281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q2, q2, q9                  \n"
2291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqmovun.s16     d0, q1                      \n"
2311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqmovun.s16     d1, q2                      \n"
2321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
2341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
2351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
2361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d1[1], [%[dst]]             \n"
2371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
2391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
2401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
2411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  );
2421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
2431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
24433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif    // USE_INTRINSICS
24533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
// 'do_two' is non-zero. The second block is read from 'in + 16' and uses the
// pixels located 4 bytes further in 'ref' and 'dst'.
static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
2531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
25433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Load all 4x4 pixels into a single uint8x16_t variable.
25533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic uint8x16_t Load4x4(const uint8_t* src) {
2568c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint32x4_t out = vdupq_n_u32(0);
25733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
25833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
25933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
26033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
26133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_u8_u32(out);
2621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
2631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Forward transform.
2651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
26633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(USE_INTRINSICS)
26733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
26833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
26933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         const int16x4_t C, const int16x4_t D,
27033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         int16x8_t* const out01,
27133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         int16x8_t* const out32) {
27233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4x2_t AB = vtrn_s16(A, B);
27333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4x2_t CD = vtrn_s16(C, D);
27433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
27533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     vreinterpret_s32_s16(CD.val[0]));
27633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
27733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     vreinterpret_s32_s16(CD.val[1]));
27833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *out01 = vreinterpretq_s16_s64(
27933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
28033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                   vreinterpret_s64_s32(tmp13.val[0])));
28133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *out32 = vreinterpretq_s16_s64(
28233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
28333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                   vreinterpret_s64_s32(tmp02.val[1])));
28433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
28533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
28633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
28733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         const uint8x8_t b) {
28833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_s16_u16(vsubl_u8(a, b));
28933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
29033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
29133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref,
29233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                       int16_t* out) {
29333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
29433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
29533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t S0 = Load4x4(src);
29633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t R0 = Load4x4(ref);
29733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
29833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
29933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t D0 = vget_low_s16(D0D1);
30033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t D1 = vget_high_s16(D0D1);
30133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t D2 = vget_low_s16(D2D3);
30233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t D3 = vget_high_s16(D2D3);
30333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
30433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
30533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {    // 1rst pass
30633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst937 = vdupq_n_s32(937);
30733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst1812 = vdupq_n_s32(1812);
30833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
30933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
31033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
31133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
31233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                    vget_high_s16(a0a1_2));
31333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
31433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                    vget_high_s16(a0a1_2));
31533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
31633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
31733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
31833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
31933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
32033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
32133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
32233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
32333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {    // 2nd pass
32433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
32533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
32633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst51000 = vdupq_n_s32(51000);
32733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
32833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
32933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
33033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
33133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
33233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
33333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
33433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
33533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
33633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
33733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
33833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t a3_eq_0 =
33933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
34033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
34133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  0, out0);
34233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  4, out1);
34333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  8, out2);
34433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out + 12, out3);
34533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
34633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
34733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
34833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else
34933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Constants consumed by the inline-assembly FTransform() below (adapted from
// vp8/encoder/arm/neon/shortfdct_neon.asm). Each value is replicated four
// times so that one vector load fills every lane of the target register.
static const int16_t kCoeff16[8] = {
  5352, 5352, 5352, 5352,        // multiplier, loaded into d16
  2217, 2217, 2217, 2217         // multiplier, loaded into d17
};
static const int32_t kCoeff32[16] = {
   1812,  1812,  1812,  1812,    // first-pass bias, loaded into q9
    937,   937,   937,   937,    // first-pass bias, loaded into q10
  12000, 12000, 12000, 12000,    // second-pass bias, loaded into q11
  51000, 51000, 51000, 51000     // second-pass bias, loaded into q12
};
3601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
// Forward 4x4 transform of the residual (src - ref), inline-assembly flavour.
//   src, ref: top-left of the two 4x4 blocks; consecutive rows are BPS bytes
//             apart. Each of the four rows is fetched with an 8-byte load, of
//             which only the first four bytes take part in the computation.
//   out:      receives the 16 resulting int16_t coefficients (one 32-byte
//             store).
// The multipliers and rounding biases come from kCoeff16 / kCoeff32.
3611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref,
3621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                       int16_t* out) {
3631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int kBPS = BPS;
// src_ptr, ref_ptr and coeff32 are advanced by the asm ("+r" operands below),
// hence the local copies.
3641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* src_ptr = src;
3651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* ref_ptr = ref;
3661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int16_t* coeff16 = kCoeff16;
3671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int32_t* coeff32 = kCoeff32;
3681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  __asm__ volatile (
3701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load 8 bytes per src row: d8=row0, d10=row1, d9=row2, d11=row3
3711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
3721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
3731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
3741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d11}, [%[src_ptr]]               \n"
3751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load 8 bytes per ref row: d12=row0, d14=row1, d13=row2, d15=row3
3771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
3781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
3791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
3801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d15}, [%[ref_ptr]]               \n"
3811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // gather the first 4 bytes of every row into q4 (src) and q6 (ref)
3831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32     q4, q5                       \n"
3841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32     q6, q7                       \n"
3851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d[0-3] = src - ref (widened to 16 bits, one row per d register)
3871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8    q0, d8, d12                  \n"
3881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8    q1, d9, d13                  \n"
3891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load coeff16 into q8(d16=5352, d17=2217)
3911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16     {q8}, [%[coeff16]]           \n"
3921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load first half of coeff32 into q9 = 1812, q10 = 937 (post-increment)
3941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
3951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load second half of coeff32 into q11 = 12000, q12 = 51000
3971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
3981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // part 1
4001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Transpose. Register dN is the same as dN in C
4011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d0, d2                   \n"
4021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d1, d3                   \n"
4031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d0, d1                   \n"
4041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d2, d3                   \n"
4051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
4071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
4081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
4091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
4101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d0, d4, d5               \n" // a0 + a1
4121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
4131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d2, d4, d5               \n" // a0 - a1
4141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshl.s16        d2, d2, #3               \n" // temp[2+i*4] = (a0-a1) << 3
4151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
4171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
4181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
4191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
4201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // temp[1+i*4] = (a2*2217 + a3*5352 + 1812) >> 9
4221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // temp[3+i*4] = (a3*2217 + 937 - a2*5352) >> 9
4231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d1, q9, #9               \n"
4241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d3, q10, #9              \n"
4251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // part 2
4271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
4281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d0, d2                   \n"
4291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d1, d3                   \n"
4301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d0, d1                   \n"
4311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d2, d3                   \n"
4321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmov.s16        d26, #7                  \n"
4341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
4361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
4371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
4381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d4, d26              \n" // a1 + 7
4391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
4401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
4421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
4431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
4451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
4461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vceq.s16        d4, d7, #0               \n"
4481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        d0, d0, #4               \n"
4501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        d2, d2, #4               \n"
4511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
4531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
4541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4550406ce1417f76f2034833414dcecc9f56253640cVikas Arora    "vmvn            d4, d4                   \n" // ~(d1 == 0): all ones where d1 != 0
4561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
4571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d1, q11, #16             \n"
4581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[4] += (d1!=0): subtracting the all-ones mask (-1) adds one
4591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d1, d1, d4               \n"
4601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
4611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d3, q12, #16             \n"
4621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // set result to out array
4641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.16         {q0, q1}, [%[out]]   \n"
4651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
4661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      [coeff32] "+r"(coeff32)          // modified registers
4671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
4681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      [out] "r"(out)                   // constants
4691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
4701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      "q10", "q11", "q12", "q13"       // clobbered
4711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  );
4721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
4731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
47433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif
47533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
47633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define LOAD_LANE_16b(VALUE, LANE) do {             \
47733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
47833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  src += stride;                                    \
47933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} while (0)
48033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
48133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void FTransformWHT(const int16_t* src, int16_t* out) {
48233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int stride = 16;
48333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t zero = vdup_n_s16(0);
48433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int32x4x4_t tmp0;
48533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int16x4x4_t in;
48633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(in, zero, zero, zero, zero);
48733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[0], 0);
48833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[1], 0);
48933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[2], 0);
49033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[3], 0);
49133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[0], 1);
49233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[1], 1);
49333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[2], 1);
49433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[3], 1);
49533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[0], 2);
49633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[1], 2);
49733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[2], 2);
49833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[3], 2);
49933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[0], 3);
50033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[1], 3);
50133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[2], 3);
50233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[3], 3);
50333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
50433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
50533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a0 = in[0 * 16] + in[2 * 16]
50633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a1 = in[1 * 16] + in[3 * 16]
50733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a2 = in[1 * 16] - in[3 * 16]
50833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a3 = in[0 * 16] - in[2 * 16]
50933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
51033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
51133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
51233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
51333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp0.val[0] = vaddq_s32(a0, a1);
51433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp0.val[1] = vaddq_s32(a3, a2);
51533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp0.val[2] = vsubq_s32(a3, a2);
51633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp0.val[3] = vsubq_s32(a0, a1);
51733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
51833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
51933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
52033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a0 = tmp[0 + i] + tmp[ 8 + i]
52133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a1 = tmp[4 + i] + tmp[12 + i]
52233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a2 = tmp[4 + i] - tmp[12 + i]
52333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a3 = tmp[0 + i] - tmp[ 8 + i]
52433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
52533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
52633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
52733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
52833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
52933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
53033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
53133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
53233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out0 = vmovn_s32(b0);
53333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out1 = vmovn_s32(b1);
53433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out2 = vmovn_s32(b2);
53533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out3 = vmovn_s32(b3);
53633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
53733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  0, out0);
53833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  4, out1);
53933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  8, out2);
54033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out + 12, out3);
54133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
5421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
54333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef LOAD_LANE_16b
5441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
5451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
5461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Texture distortion
5471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
5481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// We try to match the spectral content (weighted) between source and
5491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// reconstructed samples.
5501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
55133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// This code works but is *slower* than the inlined-asm version below
55233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
55333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// USE_INTRINSICS define.
55433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// With gcc-4.8, it's only slightly slower than the inlined.
55533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(USE_INTRINSICS)
55633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
55733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Zero extend an uint16x4_t 'v' to an int32x4_t.
55833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
55933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_s32_u32(vmovl_u16(v));
56033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
56133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
56233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Does a regular 4x4 transpose followed by an adjustment of the upper columns
56333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// in the inner rows to restore the source order of differences,
56433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// i.e., a0 - a1 | a3 - a2.
56533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
56633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int32x4x4_t out = Transpose4x4(rows);
56733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // restore source order in the columns containing differences.
56833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x2_t r1h = vget_high_s32(out.val[1]);
56933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x2_t r2h = vget_high_s32(out.val[2]);
57033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h);
57133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h);
57233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return out;
57333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
57433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
57533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
57633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                                   const uint8x8_t r2r3) {
57733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a0 = in[0] + in[2] | a1 = in[1] + in[3]
57833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
57933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a3 = in[0] - in[2] | a2 = in[1] - in[3]
58033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
58133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1));  // a0 + a1
58233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2));  // a3 + a2
58333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
58433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
58533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
58633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8x2_t transpose =
58733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
58833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
58933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
59033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                      vget_low_s16(transpose.val[1]));
59133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
59233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                      vget_high_s16(transpose.val[1]));
59333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // [0]: tmp[3] [1]: tmp[2]
59433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
59533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
59633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return res;
59733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
59833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
59933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
60033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a0 = tmp[0 + i] + tmp[8 + i];
60133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
60233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a1 = tmp[4 + i] + tmp[12+ i];
60333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
60433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a2 = tmp[4 + i] - tmp[12+ i];
60533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
60633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a3 = tmp[0 + i] - tmp[8 + i];
60733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
60833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1));  // abs(a0 + a1)
60933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2));  // abs(a3 + a2)
61033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t b2 = vabdq_s32(a3, a2);              // abs(a3 - a2)
61133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t b3 = vabdq_s32(a0, a1);              // abs(a0 - a1)
61233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4x4_t res = { { b0, b1, b2, b3 } };
61333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return res;
61433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
61533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
61633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Calculate the weighted sum of the rows in 'b'.
61733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
61833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                      const int32x4_t w0, const int32x4_t w1,
61933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                      const int32x4_t w2, const int32x4_t w3) {
62033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t s0 = vmulq_s32(w0, b.val[0]);
62133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]);
62233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]);
62333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]);
62433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int64x2_t sum1 = vpaddlq_s32(s3);
62533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1));
62633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return sum2;
62733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
62833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
62933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define LOAD_LANE_32b(src, VALUE, LANE) \
63033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
63133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
63233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Hadamard transform
63333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients.
63433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b,
63533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                    const uint16_t* const w) {
63633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x4_t d0d1 = { 0, 0, 0, 0 };
63733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x4_t d2d3 = { 0, 0, 0, 0 };
63833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_32b(a + 0 * BPS, d0d1, 0);  // a00 a01 a02 a03
63933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_32b(a + 1 * BPS, d0d1, 1);  // a10 a11 a12 a13
64033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_32b(b + 0 * BPS, d0d1, 2);  // b00 b01 b02 b03
64133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_32b(b + 1 * BPS, d0d1, 3);  // b10 b11 b12 b13
64233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_32b(a + 2 * BPS, d2d3, 0);  // a20 a21 a22 a23
64333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_32b(a + 3 * BPS, d2d3, 1);  // a30 a31 a32 a33
64433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_32b(b + 2 * BPS, d2d3, 2);  // b20 b21 b22 b23
64533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_32b(b + 3 * BPS, d2d3, 3);  // b30 b31 b32 b33
64633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
64733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
64833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
64933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
65033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint16x8x2_t tmp =
65133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
65233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
65333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
65433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
65533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                                    vget_low_u8(d2d3u8));
65633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
65733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                                    vget_high_u8(d2d3u8));
65833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
65933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
66033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
66133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
66233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
66333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
66433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
66533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
66633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
66733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
66833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
66933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                    vreinterpret_s32_s64(sum2));
67033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x2_t res = vshr_n_s32(diff, 5);
67133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    return vget_lane_s32(res, 0);
67233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
67333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
67433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
67533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef LOAD_LANE_32b
67633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
67733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else
67833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
6791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Hadamard transform
6801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients.
6811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b,
6821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                    const uint16_t* const w) {
6831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int kBPS = BPS;
6841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* A = a;
6851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* B = b;
6861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint16_t* W = w;
6871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  int sum;
6881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  __asm__ volatile (
6891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
6901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
6911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
6921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d2[1], [%[a]]            \n"
6931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
6941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
6951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
6961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
6971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d3[1], [%[b]]            \n"
6981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
6991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a d0/d2, b d1/d3
7001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d0/d1: 01 01 01 01
7011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2/d3: 23 23 23 23
7021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // But: it goes 01 45 23 67
7031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Notice the middle values are transposed
7041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         q0, q1                   \n"
7051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
7071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vaddl.u8        q2, d0, d2               \n"
7081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vaddl.u8        q10, d1, d3              \n"
7091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
7101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8        q3, d0, d2               \n"
7111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8        q11, d1, d3              \n"
7121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // tmp[0] = a0 + a1
7141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.s16      q0, q2                   \n"
7151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.s16      q8, q10                  \n"
7161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // tmp[1] = a3 + a2
7181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.s16      q1, q3                   \n"
7191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.s16      q9, q11                  \n"
7201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // No pair subtract
7221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q2 = {a0, a3}
7231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q3 = {a1, a2}
7241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         q2, q3                   \n"
7251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         q10, q11                 \n"
7261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
7281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.s16       q12, d4, d6              \n"
7291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.s16       q13, d5, d7              \n"
7301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.s16       q14, d20, d22            \n"
7311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.s16       q15, d21, d23            \n"
7321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // separate tmp[3] and tmp[2]
7341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q12 = tmp[3]
7351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q13 = tmp[2]
7361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q12, q13                 \n"
7371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q14, q15                 \n"
7381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Transpose tmp for a
7401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d1, d26                  \n" // vtrn.64
7411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d24                  \n" // vtrn.64
7421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q0, q1                   \n"
7431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q13, q12                 \n"
7441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Transpose tmp for b
7461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d17, d30                 \n" // vtrn.64
7471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d19, d28                 \n" // vtrn.64
7481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q8, q9                   \n"
7491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         q15, q14                 \n"
7501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // The first Q register is a, the second b.
7521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q0/8 tmp[0-3]
7531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q13/15 tmp[4-7]
7541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q1/9 tmp[8-11]
7551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q12/14 tmp[12-15]
7561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // These are still in 01 45 23 67 order. We fix it easily in the addition
7588b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    // case but the subtraction propagates them.
7591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d27                  \n"
7601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d19, d31                 \n"
7611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a0 = tmp[0] + tmp[8]
7631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q2, q0, q1               \n"
7641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q3, q8, q9               \n"
7651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a1 = tmp[4] + tmp[12]
7671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q10, q13, q12            \n"
7681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q11, q15, q14            \n"
7691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a2 = tmp[4] - tmp[12]
7711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q13, q13, q12            \n"
7721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q15, q15, q14            \n"
7731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // a3 = tmp[0] - tmp[8]
7751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q0, q0, q1               \n"
7761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q8, q8, q9               \n"
7771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // b0 = a0 + a1
7791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q1, q2, q10              \n"
7801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q9, q3, q11              \n"
7811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // b1 = a3 + a2
7831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q12, q0, q13             \n"
7841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s32        q14, q8, q15             \n"
7851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // b2 = a3 - a2
7871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q0, q0, q13              \n"
7881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q8, q8, q15              \n"
7891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // b3 = a0 - a1
7911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q2, q2, q10              \n"
7921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s32        q3, q3, q11              \n"
7931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.64         {q10, q11}, [%[w]]       \n"
7951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
7961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(b0)
7971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q1, q1                   \n"
7981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q9, q9                   \n"
7991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(b1)
8001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q12, q12                 \n"
8011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q14, q14                 \n"
8021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(b2)
8031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q0, q0                   \n"
8041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q8, q8                   \n"
8051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(b3)
8061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q2, q2                   \n"
8071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        q3, q3                   \n"
8081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // expand w before using.
8101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u16       q13, d20                 \n"
8111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u16       q15, d21                 \n"
8121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // w[0] * abs(b0)
8141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmul.u32        q1, q1, q13              \n"
8151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmul.u32        q9, q9, q13              \n"
8161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // w[4] * abs(b1)
8181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q1, q12, q15             \n"
8191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q9, q14, q15             \n"
8201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // expand w before using.
8221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u16       q13, d22                 \n"
8231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u16       q15, d23                 \n"
8241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // w[8] * abs(b1)
8261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q1, q0, q13              \n"
8271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q9, q8, q13              \n"
8281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // w[12] * abs(b1)
8301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q1, q2, q15              \n"
8311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmla.u32        q9, q3, q15              \n"
8321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Sum the arrays
8341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.u32      q1, q1                   \n"
8351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vpaddl.u32      q9, q9                   \n"
8361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.u64        d2, d3                   \n"
8371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.u64        d18, d19                 \n"
8381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Hadamard transform needs 4 bits of extra precision (2 bits in each
8401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
8411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // precision for coeff is 8bit of input + 4bits of Hadamard transform +
8421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // 16bits for w[] + 2 bits of abs() summation.
8431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    //
8441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
8451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // A-OK.
8461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // sum2 - sum1
8481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.u32        d0, d2, d18              \n"
8491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(sum2 - sum1)
8501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vabs.s32        d0, d0                   \n"
8511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(sum2 - sum1) >> 5
8521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.u32        d0, #5                   \n"
8531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // It would be better to move the value straight into r0 but I'm not
8551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // entirely sure how this works with inline assembly.
8561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmov.32         %[sum], d0[0]            \n"
8571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
8591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [kBPS] "r"(kBPS)
8601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
8611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
8621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  ) ;
8631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
8641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  return sum;
8651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
8661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
86733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // USE_INTRINSICS
86833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto16x16(const uint8_t* const a, const uint8_t* const b,
8701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                      const uint16_t* const w) {
8711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  int D = 0;
8721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  int x, y;
8731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
8741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    for (x = 0; x < 16; x += 4) {
8751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      D += Disto4x4(a + x + y, b + x + y, w);
8761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    }
8771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  }
8781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  return D;
8791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
8801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
88133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
88233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
88333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
88433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             int start_block, int end_block,
88533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             VP8Histogram* const histo) {
88633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
88733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int j;
88833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (j = start_block; j < end_block; ++j) {
88933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    int16_t out[16];
89033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
89133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    {
89233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      int k;
89333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const int16x8_t a0 = vld1q_s16(out + 0);
89433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const int16x8_t b0 = vld1q_s16(out + 8);
89533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
89633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
89733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
89833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
89933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
90033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
90133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
90233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
90333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      // Convert coefficients to bin.
90433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      for (k = 0; k < 16; ++k) {
90533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        histo->distribution[out[k]]++;
90633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      }
90733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    }
90833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
90933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
91033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
91133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
91233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
91333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
91433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                        const uint8_t* const b,
91533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                        uint32x4_t* const sum) {
91633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a0 = vld1q_u8(a);
91733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t b0 = vld1q_u8(b);
91833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
91933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
92033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
92133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *sum = vpadalq_u16(*sum, prod);      // pair-wise add and accumulate
92233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
92333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
92433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Horizontal sum of all four uint32_t values in 'sum'.
92533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SumToInt(uint32x4_t sum) {
92633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint64x2_t sum2 = vpaddlq_u32(sum);
92733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
92833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return (int)sum3;
92933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
93033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
93133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE16x16(const uint8_t* a, const uint8_t* b) {
9328c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint32x4_t sum = vdupq_n_u32(0);
93333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int y;
93433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (y = 0; y < 16; ++y) {
93533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
93633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
93733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return SumToInt(sum);
93833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
93933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
94033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE16x8(const uint8_t* a, const uint8_t* b) {
9418c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint32x4_t sum = vdupq_n_u32(0);
94233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int y;
94333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (y = 0; y < 8; ++y) {
94433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
94533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
94633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return SumToInt(sum);
94733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
94833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
94933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE8x8(const uint8_t* a, const uint8_t* b) {
9508c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint32x4_t sum = vdupq_n_u32(0);
95133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int y;
95233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (y = 0; y < 8; ++y) {
95333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x8_t a0 = vld1_u8(a + y * BPS);
95433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x8_t b0 = vld1_u8(b + y * BPS);
95533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x8_t abs_diff = vabd_u8(a0, b0);
95633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
95733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    sum = vpadalq_u16(sum, prod);
95833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
95933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return SumToInt(sum);
96033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
96133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
96233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE4x4(const uint8_t* a, const uint8_t* b) {
96333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a0 = Load4x4(a);
96433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t b0 = Load4x4(b);
96533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
96633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
96733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
96833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return SumToInt(vpaddlq_u16(prod));
96933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
97033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
97133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
97233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
9738c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora// Compilation with gcc-4.6.x is problematic for now.
9748c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#if !defined(WORK_AROUND_GCC)
97533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
97633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int16x8_t Quantize(int16_t* const in,
97733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                          const VP8Matrix* const mtx, int offset) {
97833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
97933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
98033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
98133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
98233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
98333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
98433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t a = vld1q_s16(in + offset);                // in
98533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
98633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
98733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
98833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
98933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
99033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
99133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
99233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
99333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
99433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
99533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
99633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
99733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
99833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1q_s16(in + offset, c4);
99933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
100033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return c3;
100133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
100233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Byte-level table-lookup indices, eight per row. Each 16-bit coefficient k
// occupies bytes 2k and 2k+1 of the 32-byte source, so every row selects four
// coefficients; read row after row the coefficient order is
// 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15.
#define COEFF_BYTES(k) 2 * (k), 2 * (k) + 1
static const uint8_t kShuffles[4][8] = {
  { COEFF_BYTES(0),  COEFF_BYTES(1),  COEFF_BYTES(4),  COEFF_BYTES(8)  },
  { COEFF_BYTES(5),  COEFF_BYTES(2),  COEFF_BYTES(3),  COEFF_BYTES(6)  },
  { COEFF_BYTES(9),  COEFF_BYTES(12), COEFF_BYTES(13), COEFF_BYTES(10) },
  { COEFF_BYTES(7),  COEFF_BYTES(11), COEFF_BYTES(14), COEFF_BYTES(15) }
};
#undef COEFF_BYTES
100933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
101033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int QuantizeBlock(int16_t in[16], int16_t out[16],
101133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                         const VP8Matrix* const mtx) {
101233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t out0 = Quantize(in, mtx, 0);
101333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t out1 = Quantize(in, mtx, 8);
10148c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint8x8x4_t shuffles;
10159e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
10169e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern  // non-standard versions there.
10179e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern#if defined(__APPLE__) && defined(__aarch64__) && \
10189e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
10198c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint8x16x2_t all_out;
10208c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
10218c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  INIT_VECTOR4(shuffles,
10228c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
10238c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
10248c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
10258c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
10268c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#else
102733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x8x4_t all_out;
102833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(all_out,
102933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vreinterpret_u8_s16(vget_low_s16(out0)),
103033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vreinterpret_u8_s16(vget_high_s16(out0)),
103133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vreinterpret_u8_s16(vget_low_s16(out1)),
103233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vreinterpret_u8_s16(vget_high_s16(out1)));
10338c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  INIT_VECTOR4(shuffles,
10348c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
10358c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
10368c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
10378c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
10388c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#endif
103933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Zigzag reordering
10408c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
10418c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
10428c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
10438c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
104433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // test zeros
104533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (*(uint64_t*)(out +  0) != 0) return 1;
104633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (*(uint64_t*)(out +  4) != 0) return 1;
104733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (*(uint64_t*)(out +  8) != 0) return 1;
104833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (*(uint64_t*)(out + 12) != 0) return 1;
104933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return 0;
105033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
105133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
10528c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#endif   // !WORK_AROUND_GCC
105333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
10541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#endif   // WEBP_USE_NEON
10551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
10561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
10571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Entry point
10581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
extern void VP8EncDspInitNEON(void);

// Points the encoder's DSP function pointers at the NEON implementations
// defined in this file. The body is empty when WEBP_USE_NEON is not defined.
void VP8EncDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  // Transforms.
  VP8FTransform = FTransform;
  VP8FTransformWHT = FTransformWHT;
  VP8ITransform = ITransform;

  // Distortion and histogram collection.
  VP8TDisto16x16 = Disto16x16;
  VP8TDisto4x4 = Disto4x4;
  VP8CollectHistogram = CollectHistogram;

  // Sum of squared errors, one entry per block size.
  VP8SSE4x4 = SSE4x4;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE16x16 = SSE16x16;

  // QuantizeBlock is only compiled when WORK_AROUND_GCC is not defined.
#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock;
#endif   // !WORK_AROUND_GCC
#endif   // WEBP_USE_NEON
}
1080