11e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Copyright 2012 Google Inc. All Rights Reserved.
21e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
30406ce1417f76f2034833414dcecc9f56253640cVikas Arora// Use of this source code is governed by a BSD-style license
40406ce1417f76f2034833414dcecc9f56253640cVikas Arora// that can be found in the COPYING file in the root of the source
50406ce1417f76f2034833414dcecc9f56253640cVikas Arora// tree. An additional intellectual property rights grant can be found
60406ce1417f76f2034833414dcecc9f56253640cVikas Arora// in the file PATENTS. All contributing project authors may
70406ce1417f76f2034833414dcecc9f56253640cVikas Arora// be found in the AUTHORS file in the root of the source tree.
81e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// -----------------------------------------------------------------------------
91e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// ARM NEON version of speed-critical encoding functions.
111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// adapted from libvpx (http://www.webmproject.org/code/)
131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "./dsp.h"
151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#if defined(WEBP_USE_NEON)
171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include <assert.h>
1933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
2033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./neon.h"
21fa39824bb690c5806358871f46940d0450973d8aJames Zern#include "../enc/vp8i_enc.h"
221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Transforms (Paragraph 14.4)
251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Inverse transform.
2733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// This code is pretty much the same as TransformOne in the dec_neon.c, except
281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// for subtraction to *ref. See the comments there for algorithmic explanations.
2933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Fixed-point multipliers used by the inverse transform.
static const int16_t kC1 = 20091;
// Stored as half of the actual kC2 value; see the comment above, which
// points to the explanations in dec_neon.c.
static const int16_t kC2 = 17734;
3233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// This intrinsics version works but is *slower* than the inline-asm version
// below when built with gcc-4.6, so it is only compiled when the
// WEBP_USE_INTRINSICS define is set.
// With gcc-4.8, it is slightly faster than the inline assembly.
377c8da7ce66017295a65ec028084b90800be377f8James Zern#if defined(WEBP_USE_INTRINSICS)
3833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
3933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
4033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
4133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
4233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
4333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
4433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
4533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// to the corresponding rows of 'dst'.
4633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
4733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                            const int16x8_t dst01,
4833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                            const int16x8_t dst23) {
4933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Unsigned saturate to 8b.
5033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
5133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
5233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
5333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Store the results.
5433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
5533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
5633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
5733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
5833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
5933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
6033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
6133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                               const uint8_t* const ref, uint8_t* const dst) {
6233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x2_t dst01 = vdup_n_u32(0);
6333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x2_t dst23 = vdup_n_u32(0);
6433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
6533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Load the source pixels.
6633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
6733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
6833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
6933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
7033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
7133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
7233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Convert to 16b.
7333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
7433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
7533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
7633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Descale with rounding.
7733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
7833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
7933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Add the inverse transform.
8033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    SaturateAndStore4x4(dst, out01, out23);
8133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
8233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
8333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
8533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     int16x8x2_t* const out) {
8633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
8733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
8833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
8933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                                  // b0 d0 b1 d1 b2 d2 ...
9033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
9133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
9233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
9333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
9433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // {rows} = in0 | in4
9533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  //          in8 | in12
9633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // B1 = in4 | in12
9733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t B1 =
9833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
9933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // C0 = kC1 * in4 | kC1 * in12
10033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // C1 = kC2 * in4 | kC2 * in12
10133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
10233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
10333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
10433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                vget_low_s16(rows->val[1]));   // in0 + in8
10533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
10633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                vget_low_s16(rows->val[1]));   // in0 - in8
10733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // c = kC2 * in4 - kC1 * in12
10833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // d = kC1 * in4 + kC2 * in12
10933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
11033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
11133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
11233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
11333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
11433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
11533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
11633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Transpose8x2(E0, E1, rows);
11733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
11833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
11933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ITransformOne(const uint8_t* ref,
12033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                          const int16_t* in, uint8_t* dst) {
12133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int16x8x2_t rows;
12233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
12333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  TransformPass(&rows);
12433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  TransformPass(&rows);
12533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Add4x4(rows.val[0], rows.val[1], ref, dst);
12633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
12733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
12833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else
12933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
1301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void ITransformOne(const uint8_t* ref,
1311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                          const int16_t* in, uint8_t* dst) {
1321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int kBPS = BPS;
13333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
1341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  __asm__ volatile (
1361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16         {q1, q2}, [%[in]]           \n"
1371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16         {d0}, [%[kC1C2]]            \n"
1381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2: in[0]
1401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3: in[8]
1411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4: in[4]
1421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5: in[12]
1431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d4                      \n"
1441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
1461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q9 = {in[4], in[12]} * kC2 >> 16
1471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d22 = a = in[0] + in[8]
1511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d23 = b = in[0] - in[8]
1521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
1531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
1541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    //  q8 = in[4]/[12] * kC1 >> 16
1561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
1571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Add {in[4], in[12]} back after the multiplication.
1591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
1601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d20 = c = in[4]*kC2 - in[12]*kC1
1621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d21 = d = in[4]*kC1 + in[12]*kC2
1631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
1641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
1651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2 = tmp[0] = a + d
1671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3 = tmp[1] = b + c
1681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4 = tmp[2] = b - c
1691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5 = tmp[3] = a - d
1701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
1711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
1721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
1731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
1741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
1761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
1771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vswp            d3, d4                      \n"
1791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
1821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d22 = a = tmp[0] + tmp[8]
1861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d23 = b = tmp[0] - tmp[8]
1871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
1881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
1891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
1911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
1921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d20 = c = in[4]*kC2 - in[12]*kC1
1941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d21 = d = in[4]*kC1 + in[12]*kC2
1951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
1961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
1971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d2 = tmp[0] = a + d
1991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d3 = tmp[1] = b + c
2001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d4 = tmp[2] = b - c
2011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d5 = tmp[3] = a - d
2021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
2031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
2041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
2051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
2061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
2081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
2091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
2101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
2111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
2131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // (val) + 4 >> 3
2151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d2, d2, #3                  \n"
2161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d3, d3, #3                  \n"
2171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d4, d4, #3                  \n"
2181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vrshr.s16       d5, d5, #3                  \n"
2191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
2211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vzip.16         q1, q2                      \n"
2221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Must accumulate before saturating
2241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u8        q8, d6                      \n"
2251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmovl.u8        q9, d7                      \n"
2261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q1, q1, q8                  \n"
2281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqadd.s16       q2, q2, q9                  \n"
2291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqmovun.s16     d0, q1                      \n"
2311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vqmovun.s16     d1, q2                      \n"
2321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
2341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
2351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
2361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.32         d1[1], [%[dst]]             \n"
2371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
2391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
2401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
2411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  );
2421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
2431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2447c8da7ce66017295a65ec028084b90800be377f8James Zern#endif    // WEBP_USE_INTRINSICS
24533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
// 'do_two' is non-zero. The second block sits 4 pixels to the right in 'ref'
// and 'dst', and 16 coefficients further along in 'in'.
static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int i;
  for (i = 0; i < num_blocks; ++i) {
    ITransformOne(ref + 4 * i, in + 16 * i, dst + 4 * i);
  }
}
2531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
25433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Load all 4x4 pixels into a single uint8x16_t variable.
25533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic uint8x16_t Load4x4(const uint8_t* src) {
2568c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint32x4_t out = vdupq_n_u32(0);
25733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
25833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
25933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
26033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
26133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_u8_u32(out);
2621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
2631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Forward transform.
2651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2667c8da7ce66017295a65ec028084b90800be377f8James Zern#if defined(WEBP_USE_INTRINSICS)
26733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
26833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
26933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         const int16x4_t C, const int16x4_t D,
27033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         int16x8_t* const out01,
27133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         int16x8_t* const out32) {
27233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4x2_t AB = vtrn_s16(A, B);
27333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4x2_t CD = vtrn_s16(C, D);
27433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
27533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     vreinterpret_s32_s16(CD.val[0]));
27633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
27733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     vreinterpret_s32_s16(CD.val[1]));
27833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *out01 = vreinterpretq_s16_s64(
27933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
28033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                   vreinterpret_s64_s32(tmp13.val[0])));
28133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *out32 = vreinterpretq_s16_s64(
28233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
28333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                   vreinterpret_s64_s32(tmp02.val[1])));
28433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
28533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
28633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
28733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         const uint8x8_t b) {
28833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_s16_u16(vsubl_u8(a, b));
28933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
29033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
29133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref,
29233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                       int16_t* out) {
29333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
29433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
29533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t S0 = Load4x4(src);
29633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t R0 = Load4x4(ref);
29733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
29833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
29933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t D0 = vget_low_s16(D0D1);
30033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t D1 = vget_high_s16(D0D1);
30133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t D2 = vget_low_s16(D2D3);
30233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t D3 = vget_high_s16(D2D3);
30333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
30433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
30533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {    // 1rst pass
30633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst937 = vdupq_n_s32(937);
30733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst1812 = vdupq_n_s32(1812);
30833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
30933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
31033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
31133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
31233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                    vget_high_s16(a0a1_2));
31333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
31433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                    vget_high_s16(a0a1_2));
31533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
31633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
31733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
31833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
31933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
32033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
32133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
32233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
32333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {    // 2nd pass
32433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
32533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
32633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst51000 = vdupq_n_s32(51000);
32733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
32833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
32933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
33033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
33133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
33233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
33333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
33433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
33533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
33633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
33733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
33833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t a3_eq_0 =
33933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
34033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
34133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  0, out0);
34233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  4, out1);
34333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  8, out2);
34433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out + 12, out3);
34533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
34633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
34733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
34833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else
34933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
3501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
// Constant tables consumed by the inline-assembly FTransform() below. Their
// memory layout matters: every value is repeated four times so that a single
// vector load fills all the lanes of one register.
// kCoeff16 holds the two multipliers (loaded as d16 = 5352, d17 = 2217).
3511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic const int16_t kCoeff16[] = {
3521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
3531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora};
// kCoeff32 holds the rounding terms: 1812 and 937 are added before the '>> 9'
// of the first pass, 12000 and 51000 before the '>> 16' of the second pass.
3541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic const int32_t kCoeff32[] = {
3551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora   1812,  1812,  1812,  1812,
3561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    937,   937,   937,   937,
3571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  12000, 12000, 12000, 12000,
3581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  51000, 51000, 51000, 51000
3591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora};
3601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
// Forward 4x4 transform of the difference (src - ref).
// 'src' and 'ref' each point to 4 rows of pixels, consecutive rows being BPS
// bytes apart. The 16 resulting int16_t coefficients are stored to 'out'.
// NOTE(review): every row load below fetches 8 bytes although only the first
// 4 are used; presumably the callers' buffers make that over-read safe --
// confirm against the callers.
3611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref,
3621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                       int16_t* out) {
3631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int kBPS = BPS;
  // Local copies: the asm post-increments these pointers ("+r" operands).
3641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* src_ptr = src;
3651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const uint8_t* ref_ptr = ref;
3661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int16_t* coeff16 = kCoeff16;
3671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const int32_t* coeff32 = kCoeff32;
3681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  __asm__ volatile (
3701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load 4 rows of src (8 bytes each): q4 = rows 0|2, q5 = rows 1|3
3711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
3721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
3731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
3741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d11}, [%[src_ptr]]               \n"
3751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load 4 rows of ref the same way: q6 = rows 0|2, q7 = rows 1|3
3771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
3781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
3791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
3801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.8 {d15}, [%[ref_ptr]]               \n"
3811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Gather the first 4 pixels of every row into q4 (src) and q6 (ref):
    // d8/d12 = rows 0-1, d9/d13 = rows 2-3. q5 and q7 are not used afterwards.
3831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32     q4, q5                       \n"
3841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32     q6, q7                       \n"
3851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // d[0-3] = src - ref, widened to 16 bits (one row per d register)
3871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8    q0, d8, d12                  \n"
3881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsubl.u8    q1, d9, d13                  \n"
3891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load coeff16 into q8 (d16 = 5352, d17 = 2217)
3911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.16     {q8}, [%[coeff16]]           \n"
3921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load the first half of coeff32: q9 = 1812, q10 = 937 (pointer advances)
3941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
3951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // load the second half of coeff32: q11 = 12000, q12 = 51000
3971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
3981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // part 1
4001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // Transpose (4x4, 16-bit). Register dN is the same as dN in C
4011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d0, d2                   \n"
4021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d1, d3                   \n"
4031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d0, d1                   \n"
4041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d2, d3                   \n"
4051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
4071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
4081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
4091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
4101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d0, d4, d5               \n" // a0 + a1
4121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
4131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d2, d4, d5               \n" // a0 - a1
4141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshl.s16        d2, d2, #3               \n" // temp[2+i*4] = (a0-a1) << 3
4151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
    // q9 and q10 already contain the rounding terms, hence the accumulates.
4161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
4171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
4181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
4191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
4201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // temp[1+i*4] = (a2*2217 + a3*5352 + 1812) >> 9
4221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // temp[3+i*4] = (a3*2217 + 937 - a2*5352) >> 9
4231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d1, q9, #9               \n"
4241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d3, q10, #9              \n"
4251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // part 2
4271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
4281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d0, d2                   \n"
4291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.32         d1, d3                   \n"
4301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d0, d1                   \n"
4311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vtrn.16         d2, d3                   \n"
4321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
    // d26 = 7, the rounding term added before the '>> 4' of op[0] and op[8]
4331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmov.s16        d26, #7                  \n"
4341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
4361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
4371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
4381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d4, d4, d26              \n" // a1 + 7
4391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
4401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
4421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
4431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
4451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
4461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
    // d4 = all-ones mask in the lanes where d1 == 0 (d4 is free at this point)
4471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vceq.s16        d4, d7, #0               \n"
4481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
    // op[0] >>= 4 and op[8] >>= 4 (arithmetic shifts)
4491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        d0, d0, #4               \n"
4501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshr.s16        d2, d2, #4               \n"
4511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
4531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
4541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4550406ce1417f76f2034833414dcecc9f56253640cVikas Arora    "vmvn            d4, d4                   \n" // !(d1 == 0): -1 where d1 != 0
4561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
4571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d1, q11, #16             \n"
4581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[4] += (d1!=0), done by subtracting the mask (-1 where d1 != 0)
4591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vsub.s16        d1, d1, d4               \n"
4601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
4611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vshrn.s32       d3, q12, #16             \n"
4621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
4631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // store the 16 coefficients (d0..d3 = op[0], op[4], op[8], op[12]) to out
4641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    "vst1.16         {q0, q1}, [%[out]]   \n"
4651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
4661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      [coeff32] "+r"(coeff32)          // modified registers
4671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
4681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      [out] "r"(out)                   // constants
    // 'out' is only an input register: the store through it is covered by the
    // "memory" clobber below.
4691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
4701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      "q10", "q11", "q12", "q13"       // clobbered
4711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  );
4721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
4731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
47433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif
47533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
47633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define LOAD_LANE_16b(VALUE, LANE) do {             \
47733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
47833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  src += stride;                                    \
47933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} while (0)
48033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
48133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void FTransformWHT(const int16_t* src, int16_t* out) {
48233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int stride = 16;
48333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t zero = vdup_n_s16(0);
48433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int32x4x4_t tmp0;
48533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int16x4x4_t in;
48633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(in, zero, zero, zero, zero);
48733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[0], 0);
48833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[1], 0);
48933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[2], 0);
49033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[3], 0);
49133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[0], 1);
49233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[1], 1);
49333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[2], 1);
49433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[3], 1);
49533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[0], 2);
49633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[1], 2);
49733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[2], 2);
49833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[3], 2);
49933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[0], 3);
50033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[1], 3);
50133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[2], 3);
50233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOAD_LANE_16b(in.val[3], 3);
50333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
50433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
50533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a0 = in[0 * 16] + in[2 * 16]
50633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a1 = in[1 * 16] + in[3 * 16]
50733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a2 = in[1 * 16] - in[3 * 16]
50833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a3 = in[0 * 16] - in[2 * 16]
50933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
51033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
51133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
51233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
51333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp0.val[0] = vaddq_s32(a0, a1);
51433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp0.val[1] = vaddq_s32(a3, a2);
51533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp0.val[2] = vsubq_s32(a3, a2);
51633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp0.val[3] = vsubq_s32(a0, a1);
51733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
51833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
51933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
52033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a0 = tmp[0 + i] + tmp[ 8 + i]
52133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a1 = tmp[4 + i] + tmp[12 + i]
52233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a2 = tmp[4 + i] - tmp[12 + i]
52333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // a3 = tmp[0 + i] - tmp[ 8 + i]
52433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
52533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
52633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
52733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
52833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
52933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
53033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
53133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
53233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out0 = vmovn_s32(b0);
53333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out1 = vmovn_s32(b1);
53433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out2 = vmovn_s32(b2);
53533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t out3 = vmovn_s32(b3);
53633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
53733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  0, out0);
53833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  4, out1);
53933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out +  8, out2);
54033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    vst1_s16(out + 12, out3);
54133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
5421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
54333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef LOAD_LANE_16b
5441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
5451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
5461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Texture distortion
5471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//
5481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// We try to match the spectral content (weighted) between source and
5491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// reconstructed samples.
5501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
5517c8da7ce66017295a65ec028084b90800be377f8James Zern// a 0123, b 0123
5527c8da7ce66017295a65ec028084b90800be377f8James Zern// a 4567, b 4567
5537c8da7ce66017295a65ec028084b90800be377f8James Zern// a 89ab, b 89ab
5547c8da7ce66017295a65ec028084b90800be377f8James Zern// a cdef, b cdef
5557c8da7ce66017295a65ec028084b90800be377f8James Zern//
5567c8da7ce66017295a65ec028084b90800be377f8James Zern// transpose
5577c8da7ce66017295a65ec028084b90800be377f8James Zern//
5587c8da7ce66017295a65ec028084b90800be377f8James Zern// a 048c, b 048c
5597c8da7ce66017295a65ec028084b90800be377f8James Zern// a 159d, b 159d
5607c8da7ce66017295a65ec028084b90800be377f8James Zern// a 26ae, b 26ae
5617c8da7ce66017295a65ec028084b90800be377f8James Zern// a 37bf, b 37bf
5627c8da7ce66017295a65ec028084b90800be377f8James Zern//
5637c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
5647c8da7ce66017295a65ec028084b90800be377f8James Zern  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
5657c8da7ce66017295a65ec028084b90800be377f8James Zern  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
5667c8da7ce66017295a65ec028084b90800be377f8James Zern  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
5677c8da7ce66017295a65ec028084b90800be377f8James Zern                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
5687c8da7ce66017295a65ec028084b90800be377f8James Zern  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
5697c8da7ce66017295a65ec028084b90800be377f8James Zern                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
5707c8da7ce66017295a65ec028084b90800be377f8James Zern  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
5717c8da7ce66017295a65ec028084b90800be377f8James Zern  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
5727c8da7ce66017295a65ec028084b90800be377f8James Zern  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
5737c8da7ce66017295a65ec028084b90800be377f8James Zern  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
5747c8da7ce66017295a65ec028084b90800be377f8James Zern  return q4_in;
5757c8da7ce66017295a65ec028084b90800be377f8James Zern}
5767c8da7ce66017295a65ec028084b90800be377f8James Zern
5770912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zernstatic WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
5787c8da7ce66017295a65ec028084b90800be377f8James Zern  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
5797c8da7ce66017295a65ec028084b90800be377f8James Zern  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
5800912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
5810912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
5820912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
5830912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
5847c8da7ce66017295a65ec028084b90800be377f8James Zern  int16x8x4_t q4_out;
5857c8da7ce66017295a65ec028084b90800be377f8James Zern  // tmp[0] = a0 + a1
5867c8da7ce66017295a65ec028084b90800be377f8James Zern  // tmp[1] = a3 + a2
5877c8da7ce66017295a65ec028084b90800be377f8James Zern  // tmp[2] = a3 - a2
5887c8da7ce66017295a65ec028084b90800be377f8James Zern  // tmp[3] = a0 - a1
5897c8da7ce66017295a65ec028084b90800be377f8James Zern  INIT_VECTOR4(q4_out,
5900912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern               vabsq_s16(vaddq_s16(q_a0, q_a1)),
5910912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern               vabsq_s16(vaddq_s16(q_a3, q_a2)),
5920912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
5937c8da7ce66017295a65ec028084b90800be377f8James Zern  return q4_out;
5947c8da7ce66017295a65ec028084b90800be377f8James Zern}
5957c8da7ce66017295a65ec028084b90800be377f8James Zern
5960912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zernstatic WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
5970912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
5980912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern                                                        q4_in.val[2]));
5990912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
6000912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern                                                        q4_in.val[3]));
6010912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
6020912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern                                                        q4_in.val[3]));
6030912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
6040912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern                                                        q4_in.val[2]));
6050912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  int16x8x4_t q4_out;
6067c8da7ce66017295a65ec028084b90800be377f8James Zern
6070912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  INIT_VECTOR4(q4_out,
6080912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
6090912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
6100912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  return q4_out;
6117c8da7ce66017295a65ec028084b90800be377f8James Zern}
6127c8da7ce66017295a65ec028084b90800be377f8James Zern
6137c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
6147c8da7ce66017295a65ec028084b90800be377f8James Zern  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
6157c8da7ce66017295a65ec028084b90800be377f8James Zern  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
6167c8da7ce66017295a65ec028084b90800be377f8James Zern  int16x4x4_t d4_w;
6177c8da7ce66017295a65ec028084b90800be377f8James Zern  INIT_VECTOR4(d4_w,
6187c8da7ce66017295a65ec028084b90800be377f8James Zern               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
6197c8da7ce66017295a65ec028084b90800be377f8James Zern               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
6207c8da7ce66017295a65ec028084b90800be377f8James Zern               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
6217c8da7ce66017295a65ec028084b90800be377f8James Zern               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
6227c8da7ce66017295a65ec028084b90800be377f8James Zern  return d4_w;
6237c8da7ce66017295a65ec028084b90800be377f8James Zern}
6247c8da7ce66017295a65ec028084b90800be377f8James Zern
6257c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
6267c8da7ce66017295a65ec028084b90800be377f8James Zern                                      const int16x4x4_t d4_w) {
6277c8da7ce66017295a65ec028084b90800be377f8James Zern  int32x2_t d_sum;
6287c8da7ce66017295a65ec028084b90800be377f8James Zern  // sum += w[ 0] * abs(b0);
6297c8da7ce66017295a65ec028084b90800be377f8James Zern  // sum += w[ 4] * abs(b1);
6307c8da7ce66017295a65ec028084b90800be377f8James Zern  // sum += w[ 8] * abs(b2);
6317c8da7ce66017295a65ec028084b90800be377f8James Zern  // sum += w[12] * abs(b3);
6327c8da7ce66017295a65ec028084b90800be377f8James Zern  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
6337c8da7ce66017295a65ec028084b90800be377f8James Zern  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
6347c8da7ce66017295a65ec028084b90800be377f8James Zern  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
6357c8da7ce66017295a65ec028084b90800be377f8James Zern  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
6367c8da7ce66017295a65ec028084b90800be377f8James Zern  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
6377c8da7ce66017295a65ec028084b90800be377f8James Zern  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
6387c8da7ce66017295a65ec028084b90800be377f8James Zern  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
6397c8da7ce66017295a65ec028084b90800be377f8James Zern  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
6407c8da7ce66017295a65ec028084b90800be377f8James Zern
6417c8da7ce66017295a65ec028084b90800be377f8James Zern  q_sum0 = vaddq_s32(q_sum0, q_sum1);
6427c8da7ce66017295a65ec028084b90800be377f8James Zern  q_sum2 = vaddq_s32(q_sum2, q_sum3);
6437c8da7ce66017295a65ec028084b90800be377f8James Zern  q_sum2 = vaddq_s32(q_sum0, q_sum2);
6447c8da7ce66017295a65ec028084b90800be377f8James Zern  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
6457c8da7ce66017295a65ec028084b90800be377f8James Zern  d_sum = vpadd_s32(d_sum, d_sum);
6467c8da7ce66017295a65ec028084b90800be377f8James Zern  return d_sum;
64733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
64833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
64933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define LOAD_LANE_32b(src, VALUE, LANE) \
6507c8da7ce66017295a65ec028084b90800be377f8James Zern    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
65133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
65233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Hadamard transform
65333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients.
6540912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern// w[] contains a row-major 4 by 4 symmetric matrix.
65533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b,
65633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                    const uint16_t* const w) {
6577c8da7ce66017295a65ec028084b90800be377f8James Zern  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
6587c8da7ce66017295a65ec028084b90800be377f8James Zern  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
6597c8da7ce66017295a65ec028084b90800be377f8James Zern  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
6607c8da7ce66017295a65ec028084b90800be377f8James Zern  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
6617c8da7ce66017295a65ec028084b90800be377f8James Zern  uint8x8x4_t d4_in;
6627c8da7ce66017295a65ec028084b90800be377f8James Zern
6637c8da7ce66017295a65ec028084b90800be377f8James Zern  // load data a, b
6647c8da7ce66017295a65ec028084b90800be377f8James Zern  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
6657c8da7ce66017295a65ec028084b90800be377f8James Zern  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
6667c8da7ce66017295a65ec028084b90800be377f8James Zern  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
6677c8da7ce66017295a65ec028084b90800be377f8James Zern  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
6687c8da7ce66017295a65ec028084b90800be377f8James Zern  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
6697c8da7ce66017295a65ec028084b90800be377f8James Zern  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
6707c8da7ce66017295a65ec028084b90800be377f8James Zern  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
6717c8da7ce66017295a65ec028084b90800be377f8James Zern  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
6727c8da7ce66017295a65ec028084b90800be377f8James Zern  INIT_VECTOR4(d4_in,
6737c8da7ce66017295a65ec028084b90800be377f8James Zern               vreinterpret_u8_u32(d_in_ab_0123),
6747c8da7ce66017295a65ec028084b90800be377f8James Zern               vreinterpret_u8_u32(d_in_ab_4567),
6757c8da7ce66017295a65ec028084b90800be377f8James Zern               vreinterpret_u8_u32(d_in_ab_89ab),
6767c8da7ce66017295a65ec028084b90800be377f8James Zern               vreinterpret_u8_u32(d_in_ab_cdef));
67733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
67833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
6790912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    // Vertical pass first to avoid a transpose (vertical and horizontal passes
6800912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    // are commutative because w/kWeightY is symmetric) and subsequent
6810912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    // transpose.
6820912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
6837c8da7ce66017295a65ec028084b90800be377f8James Zern    const int16x4x4_t d4_w = DistoLoadW(w);
6840912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    // horizontal pass
6850912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
6860912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
6870912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    int32x2_t d_sum = DistoSum(q4_h, d4_w);
68833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
6891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    // abs(sum2 - sum1) >> 5
6907c8da7ce66017295a65ec028084b90800be377f8James Zern    d_sum = vabs_s32(d_sum);
6910912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    d_sum = vshr_n_s32(d_sum, 5);
6927c8da7ce66017295a65ec028084b90800be377f8James Zern    return vget_lane_s32(d_sum, 0);
6937c8da7ce66017295a65ec028084b90800be377f8James Zern  }
6941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
6957c8da7ce66017295a65ec028084b90800be377f8James Zern#undef LOAD_LANE_32b
69633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
6971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto16x16(const uint8_t* const a, const uint8_t* const b,
6981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora                      const uint16_t* const w) {
6991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  int D = 0;
7001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  int x, y;
7011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
7021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    for (x = 0; x < 16; x += 4) {
7031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora      D += Disto4x4(a + x + y, b + x + y, w);
7041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    }
7051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  }
7061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  return D;
7071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
7081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
70933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
71033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
71133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
71233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             int start_block, int end_block,
71333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                             VP8Histogram* const histo) {
71433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
71533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int j;
7167c8da7ce66017295a65ec028084b90800be377f8James Zern  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
71733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (j = start_block; j < end_block; ++j) {
71833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    int16_t out[16];
71933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
72033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    {
72133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      int k;
72233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const int16x8_t a0 = vld1q_s16(out + 0);
72333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const int16x8_t b0 = vld1q_s16(out + 8);
72433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
72533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
72633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
72733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
72833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
72933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
73033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
73133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
73233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      // Convert coefficients to bin.
73333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      for (k = 0; k < 16; ++k) {
7347c8da7ce66017295a65ec028084b90800be377f8James Zern        ++distribution[out[k]];
73533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      }
73633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    }
73733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
7387c8da7ce66017295a65ec028084b90800be377f8James Zern  VP8SetHistogramData(distribution, histo);
73933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
74033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
74133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
74233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
74333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
74433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                        const uint8_t* const b,
74533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                        uint32x4_t* const sum) {
74633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a0 = vld1q_u8(a);
74733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t b0 = vld1q_u8(b);
74833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
749fa39824bb690c5806358871f46940d0450973d8aJames Zern  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
750fa39824bb690c5806358871f46940d0450973d8aJames Zern                                    vget_low_u8(abs_diff));
751fa39824bb690c5806358871f46940d0450973d8aJames Zern  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
752fa39824bb690c5806358871f46940d0450973d8aJames Zern                                    vget_high_u8(abs_diff));
753fa39824bb690c5806358871f46940d0450973d8aJames Zern  /* pair-wise adds and widen */
754fa39824bb690c5806358871f46940d0450973d8aJames Zern  const uint32x4_t sum1 = vpaddlq_u16(prod1);
755fa39824bb690c5806358871f46940d0450973d8aJames Zern  const uint32x4_t sum2 = vpaddlq_u16(prod2);
756fa39824bb690c5806358871f46940d0450973d8aJames Zern  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
75733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
75833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
75933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Horizontal sum of all four uint32_t values in 'sum'.
76033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SumToInt(uint32x4_t sum) {
76133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint64x2_t sum2 = vpaddlq_u32(sum);
76233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
76333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return (int)sum3;
76433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
76533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
766fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
7678c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint32x4_t sum = vdupq_n_u32(0);
76833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int y;
76933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (y = 0; y < 16; ++y) {
77033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
77133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
77233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return SumToInt(sum);
77333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
77433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
775fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
7768c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint32x4_t sum = vdupq_n_u32(0);
77733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int y;
77833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (y = 0; y < 8; ++y) {
77933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
78033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
78133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return SumToInt(sum);
78233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
78333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
784fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
7858c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint32x4_t sum = vdupq_n_u32(0);
78633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int y;
78733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (y = 0; y < 8; ++y) {
78833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x8_t a0 = vld1_u8(a + y * BPS);
78933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x8_t b0 = vld1_u8(b + y * BPS);
79033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x8_t abs_diff = vabd_u8(a0, b0);
79133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
79233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    sum = vpadalq_u16(sum, prod);
79333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
79433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return SumToInt(sum);
79533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
79633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
797fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
79833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a0 = Load4x4(a);
79933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t b0 = Load4x4(b);
80033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
801fa39824bb690c5806358871f46940d0450973d8aJames Zern  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
802fa39824bb690c5806358871f46940d0450973d8aJames Zern                                    vget_low_u8(abs_diff));
803fa39824bb690c5806358871f46940d0450973d8aJames Zern  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
804fa39824bb690c5806358871f46940d0450973d8aJames Zern                                    vget_high_u8(abs_diff));
805fa39824bb690c5806358871f46940d0450973d8aJames Zern  /* pair-wise adds and widen */
806fa39824bb690c5806358871f46940d0450973d8aJames Zern  const uint32x4_t sum1 = vpaddlq_u16(prod1);
807fa39824bb690c5806358871f46940d0450973d8aJames Zern  const uint32x4_t sum2 = vpaddlq_u16(prod2);
808fa39824bb690c5806358871f46940d0450973d8aJames Zern  return SumToInt(vaddq_u32(sum1, sum2));
80933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
81033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
81133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
81233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8138c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora// Compilation with gcc-4.6.x is problematic for now.
8148c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#if !defined(WORK_AROUND_GCC)
81533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
81633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int16x8_t Quantize(int16_t* const in,
81733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                          const VP8Matrix* const mtx, int offset) {
81833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
81933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
82033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
82133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
82233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
82333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
82433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t a = vld1q_s16(in + offset);                // in
82533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
82633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
82733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
82833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
82933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
83033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
83133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
83233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
83333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
83433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
83533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
83633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
83733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
83833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1q_s16(in + offset, c4);
83933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
84033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return c3;
84133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
84233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Byte-level table-lookup indices used by QuantizeBlock() to put the 16
// quantized levels (two int16x8_t, viewed as 32 bytes) into zigzag order.
// The 16-bit coefficient number k occupies bytes 2k and 2k+1; each row
// produces four consecutive output coefficients.
#define ZIGZAG_COEFF_BYTES(k) (2 * (k)), (2 * (k) + 1)
static const uint8_t kShuffles[4][8] = {
  { ZIGZAG_COEFF_BYTES(0),  ZIGZAG_COEFF_BYTES(1),
    ZIGZAG_COEFF_BYTES(4),  ZIGZAG_COEFF_BYTES(8) },
  { ZIGZAG_COEFF_BYTES(5),  ZIGZAG_COEFF_BYTES(2),
    ZIGZAG_COEFF_BYTES(3),  ZIGZAG_COEFF_BYTES(6) },
  { ZIGZAG_COEFF_BYTES(9),  ZIGZAG_COEFF_BYTES(12),
    ZIGZAG_COEFF_BYTES(13), ZIGZAG_COEFF_BYTES(10) },
  { ZIGZAG_COEFF_BYTES(7),  ZIGZAG_COEFF_BYTES(11),
    ZIGZAG_COEFF_BYTES(14), ZIGZAG_COEFF_BYTES(15) }
};
#undef ZIGZAG_COEFF_BYTES
84933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
85033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int QuantizeBlock(int16_t in[16], int16_t out[16],
85133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                         const VP8Matrix* const mtx) {
85233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t out0 = Quantize(in, mtx, 0);
85333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t out1 = Quantize(in, mtx, 8);
8548c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint8x8x4_t shuffles;
8559e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
8569e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern  // non-standard versions there.
8579e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern#if defined(__APPLE__) && defined(__aarch64__) && \
8589e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
8598c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  uint8x16x2_t all_out;
8608c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
8618c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  INIT_VECTOR4(shuffles,
8628c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
8638c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
8648c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
8658c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
8668c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#else
86733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x8x4_t all_out;
86833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(all_out,
86933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vreinterpret_u8_s16(vget_low_s16(out0)),
87033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vreinterpret_u8_s16(vget_high_s16(out0)),
87133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vreinterpret_u8_s16(vget_low_s16(out1)),
87233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vreinterpret_u8_s16(vget_high_s16(out1)));
8738c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  INIT_VECTOR4(shuffles,
8748c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
8758c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
8768c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
8778c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
8788c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#endif
87933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Zigzag reordering
8808c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
8818c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
8828c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
8838c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
88433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // test zeros
88533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (*(uint64_t*)(out +  0) != 0) return 1;
88633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (*(uint64_t*)(out +  4) != 0) return 1;
88733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (*(uint64_t*)(out +  8) != 0) return 1;
88833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (*(uint64_t*)(out + 12) != 0) return 1;
88933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return 0;
89033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
89133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8927c8da7ce66017295a65ec028084b90800be377f8James Zernstatic int Quantize2Blocks(int16_t in[32], int16_t out[32],
8937c8da7ce66017295a65ec028084b90800be377f8James Zern                           const VP8Matrix* const mtx) {
8947c8da7ce66017295a65ec028084b90800be377f8James Zern  int nz;
8957c8da7ce66017295a65ec028084b90800be377f8James Zern  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
8967c8da7ce66017295a65ec028084b90800be377f8James Zern  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
8977c8da7ce66017295a65ec028084b90800be377f8James Zern  return nz;
8987c8da7ce66017295a65ec028084b90800be377f8James Zern}
89933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
9007c8da7ce66017295a65ec028084b90800be377f8James Zern#endif   // !WORK_AROUND_GCC
9011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
9021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
9031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Entry point
9041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
9051e7bf8805bd030c19924a5306837ecd72c295751Vikas Aroraextern void VP8EncDspInitNEON(void);
9061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
9077c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
9081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  VP8ITransform = ITransform;
9091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  VP8FTransform = FTransform;
9101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
9111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  VP8FTransformWHT = FTransformWHT;
9121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
9131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  VP8TDisto4x4 = Disto4x4;
9141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  VP8TDisto16x16 = Disto16x16;
91533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8CollectHistogram = CollectHistogram;
916fa39824bb690c5806358871f46940d0450973d8aJames Zern
917fa39824bb690c5806358871f46940d0450973d8aJames Zern  VP8SSE16x16 = SSE16x16_NEON;
918fa39824bb690c5806358871f46940d0450973d8aJames Zern  VP8SSE16x8 = SSE16x8_NEON;
919fa39824bb690c5806358871f46940d0450973d8aJames Zern  VP8SSE8x8 = SSE8x8_NEON;
920fa39824bb690c5806358871f46940d0450973d8aJames Zern  VP8SSE4x4 = SSE4x4_NEON;
921fa39824bb690c5806358871f46940d0450973d8aJames Zern
9228c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#if !defined(WORK_AROUND_GCC)
92333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8EncQuantizeBlock = QuantizeBlock;
9247c8da7ce66017295a65ec028084b90800be377f8James Zern  VP8EncQuantize2Blocks = Quantize2Blocks;
92533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif
9261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
9277c8da7ce66017295a65ec028084b90800be377f8James Zern
9287c8da7ce66017295a65ec028084b90800be377f8James Zern#else  // !WEBP_USE_NEON
9297c8da7ce66017295a65ec028084b90800be377f8James Zern
9307c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
9317c8da7ce66017295a65ec028084b90800be377f8James Zern
9327c8da7ce66017295a65ec028084b90800be377f8James Zern#endif  // WEBP_USE_NEON
933