1a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora// Copyright 2012 Google Inc. All Rights Reserved.
2a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//
30406ce1417f76f2034833414dcecc9f56253640cVikas Arora// Use of this source code is governed by a BSD-style license
40406ce1417f76f2034833414dcecc9f56253640cVikas Arora// that can be found in the COPYING file in the root of the source
50406ce1417f76f2034833414dcecc9f56253640cVikas Arora// tree. An additional intellectual property rights grant can be found
60406ce1417f76f2034833414dcecc9f56253640cVikas Arora// in the file PATENTS. All contributing project authors may
70406ce1417f76f2034833414dcecc9f56253640cVikas Arora// be found in the AUTHORS file in the root of the source tree.
8a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora// -----------------------------------------------------------------------------
9a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//
10a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora// ARM NEON version of dsp functions and loop filtering.
11a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//
12a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora// Authors: Somnath Banerjee (somnath@google.com)
13a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//          Johann Koenig (johannkoenig@google.com)
14a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
15a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#include "./dsp.h"
16a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#if defined(WEBP_USE_NEON)
181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./neon.h"
201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "../dec/vp8i.h"
211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
2233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
2333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// NxM Loading functions
2433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
2533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Load/Store vertical edge
// Inline-asm text for a single "vld4.8" lane load. Every argument is
// stringified verbatim: c1..c4 are the four destination registers, 'lane' the
// lane index written in each of them, 'base' the address operand and 'stride'
// the trailing operand of the instruction.
#define LOAD8x4_LANE(c1, c2, c3, c4, lane, base, stride)                       \
  "vld4.8   {" #c1 "[" #lane "], " #c2 "[" #lane "], "                         \
               #c3 "[" #lane "], " #c4 "[" #lane "]},"                         \
               #base "," #stride "\n"

// Load/Store vertical edge
// Expands to the asm text of eight "vld4.8" lane loads filling lanes 0..7 of
// c1..c4. Even lanes use 'b1' as address operand, odd lanes use 'b2'.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  LOAD8x4_LANE(c1, c2, c3, c4, 0, b1, stride)                                  \
  LOAD8x4_LANE(c1, c2, c3, c4, 1, b2, stride)                                  \
  LOAD8x4_LANE(c1, c2, c3, c4, 2, b1, stride)                                  \
  LOAD8x4_LANE(c1, c2, c3, c4, 3, b2, stride)                                  \
  LOAD8x4_LANE(c1, c2, c3, c4, 4, b1, stride)                                  \
  LOAD8x4_LANE(c1, c2, c3, c4, 5, b2, stride)                                  \
  LOAD8x4_LANE(c1, c2, c3, c4, 6, b1, stride)                                  \
  LOAD8x4_LANE(c1, c2, c3, c4, 7, b2, stride)
3533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Inline-asm text for a single "vst2.8" lane store. Every argument is
// stringified verbatim: c1/c2 are the two source registers, 'lane' the lane
// index taken from each, 'p' the address operand and 'stride' the trailing
// operand of the instruction.
#define STORE8x2_LANE(c1, c2, lane, p, stride)                                 \
  "vst2.8   {" #c1 "[" #lane "], " #c2 "[" #lane "]}," #p "," #stride " \n"

// Expands to the asm text of eight "vst2.8" lane stores, one per lane 0..7 of
// c1/c2, all using the same address operand 'p'.
#define STORE8x2(c1, c2, p, stride)                                            \
  STORE8x2_LANE(c1, c2, 0, p, stride)                                          \
  STORE8x2_LANE(c1, c2, 1, p, stride)                                          \
  STORE8x2_LANE(c1, c2, 2, p, stride)                                          \
  STORE8x2_LANE(c1, c2, 3, p, stride)                                          \
  STORE8x2_LANE(c1, c2, 4, p, stride)                                          \
  STORE8x2_LANE(c1, c2, 5, p, stride)                                          \
  STORE8x2_LANE(c1, c2, 6, p, stride)                                          \
  STORE8x2_LANE(c1, c2, 7, p, stride)
4533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
4633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if !defined(WORK_AROUND_GCC)
4733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
4833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
4933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// (register alloc, probably). The variants somewhat mitigate the problem, but
5033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// not quite. HFilter16i() remains problematic.
5133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
5233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8_t zero = vdup_n_u8(0);
5333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x8x4_t out;
5433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(out, zero, zero, zero, zero);
5533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld4_lane_u8(src + 0 * stride, out, 0);
5633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld4_lane_u8(src + 1 * stride, out, 1);
5733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld4_lane_u8(src + 2 * stride, out, 2);
5833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld4_lane_u8(src + 3 * stride, out, 3);
5933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld4_lane_u8(src + 4 * stride, out, 4);
6033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld4_lane_u8(src + 5 * stride, out, 5);
6133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld4_lane_u8(src + 6 * stride, out, 6);
6233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  out = vld4_lane_u8(src + 7 * stride, out, 7);
6333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return out;
6433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
6533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
6633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
6733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const p1, uint8x16_t* const p0,
6833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const q0, uint8x16_t* const q1) {
6933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
7033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
7133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
7233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
7333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
7433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
7533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
7633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
7733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
7833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
7933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else  // WORK_AROUND_GCC
8033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
8233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
8333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  src += stride;                                                     \
8433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} while (0)
8533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
8633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
8733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const p1, uint8x16_t* const p0,
8833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const q0, uint8x16_t* const q1) {
8933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4_t zero = vdupq_n_u32(0);
9033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x4x4_t in;
9133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(in, zero, zero, zero, zero);
9233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  src -= 2;
9333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[0], 0);
9433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[1], 0);
9533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[2], 0);
9633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[3], 0);
9733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[0], 1);
9833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[1], 1);
9933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[2], 1);
10033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[3], 1);
10133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[0], 2);
10233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[1], 2);
10333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[2], 2);
10433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[3], 2);
10533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[0], 3);
10633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[1], 3);
10733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[2], 3);
10833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  LOADQ_LANE_32b(in.val[3], 3);
10933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Transpose four 4x4 parts:
11033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
11133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
11233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                        vreinterpretq_u8_u32(in.val[1]));
11333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
11433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                        vreinterpretq_u8_u32(in.val[3]));
11533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
11633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         vreinterpretq_u16_u8(row23.val[0]));
11733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
11833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         vreinterpretq_u16_u8(row23.val[1]));
11933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    *p1 = vreinterpretq_u8_u16(row02.val[0]);
12033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    *p0 = vreinterpretq_u8_u16(row13.val[0]);
12133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    *q0 = vreinterpretq_u8_u16(row02.val[1]);
12233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    *q1 = vreinterpretq_u8_u16(row13.val[1]);
12333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
12433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
12533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef LOADQ_LANE_32b
12633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
12733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // !WORK_AROUND_GCC
12833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
12933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
13033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const p3, uint8x16_t* const p2,
13133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const p1, uint8x16_t* const p0,
13233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const q0, uint8x16_t* const q1,
13333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const q2, uint8x16_t* const q3) {
13433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load4x16(src - 2, stride, p3, p2, p1, p0);
13533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load4x16(src + 2, stride, q0, q1, q2, q3);
13633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
13733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
13833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
13933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const p1, uint8x16_t* const p0,
14033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const q0, uint8x16_t* const q1) {
14133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p1 = vld1q_u8(src - 2 * stride);
14233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p0 = vld1q_u8(src - 1 * stride);
14333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q0 = vld1q_u8(src + 0 * stride);
14433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q1 = vld1q_u8(src + 1 * stride);
14533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
14633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
14733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
14833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const p3, uint8x16_t* const p2,
14933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const p1, uint8x16_t* const p0,
15033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const q0, uint8x16_t* const q1,
15133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8x16_t* const q2, uint8x16_t* const q3) {
15233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load16x4(src - 2  * stride, stride, p3, p2, p1, p0);
15333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load16x4(src + 2  * stride, stride, q0, q1, q2, q3);
15433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
15533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
15633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Load8x8x2(const uint8_t* const u,
15733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  const uint8_t* const v,
15833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  int stride,
15933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  uint8x16_t* const p3, uint8x16_t* const p2,
16033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  uint8x16_t* const p1, uint8x16_t* const p0,
16133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  uint8x16_t* const q0, uint8x16_t* const q1,
16233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  uint8x16_t* const q2, uint8x16_t* const q3) {
16333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
16433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // and the v-samples on the higher half.
16533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
16633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
16733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
16833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
16933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
17033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
17133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
17233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
17333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
17433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
17533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if !defined(WORK_AROUND_GCC)
17633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
17733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define LOAD_UV_8(ROW) \
17833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
17933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
18033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
18133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   const uint8_t* const v,
18233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   int stride,
18333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   uint8x16_t* const p3, uint8x16_t* const p2,
18433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   uint8x16_t* const p1, uint8x16_t* const p0,
18533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   uint8x16_t* const q0, uint8x16_t* const q1,
18633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   uint8x16_t* const q2, uint8x16_t* const q3) {
18733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
18833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // and the v-samples on the higher half.
18933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t row0 = LOAD_UV_8(0);
19033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t row1 = LOAD_UV_8(1);
19133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t row2 = LOAD_UV_8(2);
19233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t row3 = LOAD_UV_8(3);
19333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t row4 = LOAD_UV_8(4);
19433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t row5 = LOAD_UV_8(5);
19533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t row6 = LOAD_UV_8(6);
19633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t row7 = LOAD_UV_8(7);
19733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Perform two side-by-side 8x8 transposes
19833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
19933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
20033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
20133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
20233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
20333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
20433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
20533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
20633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
20733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                                    // u01 u11 u03 u13 ...
20833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
20933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                                    // u21 u31 u23 u33 ...
21033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
21133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
21233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
21333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       vreinterpretq_u16_u8(row23.val[0]));
21433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
21533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       vreinterpretq_u16_u8(row23.val[1]));
21633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
21733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       vreinterpretq_u16_u8(row67.val[0]));
21833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
21933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       vreinterpretq_u16_u8(row67.val[1]));
22033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
22133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       vreinterpretq_u32_u16(row46.val[0]));
22233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
22333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       vreinterpretq_u32_u16(row46.val[1]));
22433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
22533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       vreinterpretq_u32_u16(row57.val[0]));
22633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
22733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       vreinterpretq_u32_u16(row57.val[1]));
22833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p3 = vreinterpretq_u8_u32(row04.val[0]);
22933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p2 = vreinterpretq_u8_u32(row15.val[0]);
23033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p1 = vreinterpretq_u8_u32(row26.val[0]);
23133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *p0 = vreinterpretq_u8_u32(row37.val[0]);
23233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q0 = vreinterpretq_u8_u32(row04.val[1]);
23333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q1 = vreinterpretq_u8_u32(row15.val[1]);
23433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q2 = vreinterpretq_u8_u32(row26.val[1]);
23533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *q3 = vreinterpretq_u8_u32(row37.val[1]);
23633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
23733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef LOAD_UV_8
23833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
23933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // !WORK_AROUND_GCC
24033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
24133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store2x8(const uint8x8x2_t v,
24233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8_t* const dst, int stride) {
24333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst2_lane_u8(dst + 0 * stride, v, 0);
24433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst2_lane_u8(dst + 1 * stride, v, 1);
24533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst2_lane_u8(dst + 2 * stride, v, 2);
24633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst2_lane_u8(dst + 3 * stride, v, 3);
24733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst2_lane_u8(dst + 4 * stride, v, 4);
24833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst2_lane_u8(dst + 5 * stride, v, 5);
24933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst2_lane_u8(dst + 6 * stride, v, 6);
25033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst2_lane_u8(dst + 7 * stride, v, 7);
25133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
25233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
25333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
25433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  uint8_t* const dst, int stride) {
25533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x8x2_t lo, hi;
25633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  lo.val[0] = vget_low_u8(p0);
25733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  lo.val[1] = vget_low_u8(q0);
25833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  hi.val[0] = vget_high_u8(p0);
25933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  hi.val[1] = vget_high_u8(q0);
26033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store2x8(lo, dst - 1 + 0 * stride, stride);
26133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store2x8(hi, dst - 1 + 8 * stride, stride);
26233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
26333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
26433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if !defined(WORK_AROUND_GCC)
26533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store4x8(const uint8x8x4_t v,
26633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                 uint8_t* const dst, int stride) {
26733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(dst + 0 * stride, v, 0);
26833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(dst + 1 * stride, v, 1);
26933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(dst + 2 * stride, v, 2);
27033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(dst + 3 * stride, v, 3);
27133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(dst + 4 * stride, v, 4);
27233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(dst + 5 * stride, v, 5);
27333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(dst + 6 * stride, v, 6);
27433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(dst + 7 * stride, v, 7);
27533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
27633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
27733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
27833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  const uint8x16_t q0, const uint8x16_t q1,
27933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  uint8_t* const dst, int stride) {
28033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x8x4_t lo, hi;
28133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(lo,
28233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vget_low_u8(p1), vget_low_u8(p0),
28333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vget_low_u8(q0), vget_low_u8(q1));
28433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(hi,
28533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vget_high_u8(p1), vget_high_u8(p0),
28633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vget_high_u8(q0), vget_high_u8(q1));
28733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store4x8(lo, dst - 2 + 0 * stride, stride);
28833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store4x8(hi, dst - 2 + 8 * stride, stride);
28933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
29033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // !WORK_AROUND_GCC
29133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
29233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
29333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  uint8_t* const dst, int stride) {
29433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1q_u8(dst - stride, p0);
29533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1q_u8(dst, q0);
29633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
29733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
29833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
29933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  const uint8x16_t q0, const uint8x16_t q1,
30033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                  uint8_t* const dst, int stride) {
30133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store16x2(p1, p0, dst - stride, stride);
30233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store16x2(q0, q1, dst + stride, stride);
30333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
30433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
30533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
30633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   uint8_t* const u, uint8_t* const v,
30733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   int stride) {
30833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // p0 and q0 contain the u+v samples packed in low/high halves.
30933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_u8(u - stride, vget_low_u8(p0));
31033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_u8(u,          vget_low_u8(q0));
31133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_u8(v - stride, vget_high_u8(p0));
31233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_u8(v,          vget_high_u8(q0));
31333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
31433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
31533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
31633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   const uint8x16_t q0, const uint8x16_t q1,
31733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   uint8_t* const u, uint8_t* const v,
31833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   int stride) {
31933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // The p1...q1 registers contain the u+v samples packed in low/high halves.
32033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store8x2x2(p1, p0, u - stride, v - stride, stride);
32133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store8x2x2(q0, q1, u + stride, v + stride, stride);
32233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
32333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
32433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if !defined(WORK_AROUND_GCC)
32533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
32633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
32733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
32833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
32933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  (DST) += stride;                                \
33033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} while (0)
33133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
33233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
33333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   const uint8x16_t p0, const uint8x16_t q0,
33433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   const uint8x16_t q1, const uint8x16_t q2,
33533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   uint8_t* u, uint8_t* v,
33633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   int stride) {
33733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x8x3_t u0, u1, v0, v1;
33833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
33933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
34033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
34133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
34233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(u, u0, u1, 0);
34333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(u, u0, u1, 1);
34433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(u, u0, u1, 2);
34533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(u, u0, u1, 3);
34633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(u, u0, u1, 4);
34733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(u, u0, u1, 5);
34833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(u, u0, u1, 6);
34933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(u, u0, u1, 7);
35033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(v, v0, v1, 0);
35133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(v, v0, v1, 1);
35233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(v, v0, v1, 2);
35333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(v, v0, v1, 3);
35433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(v, v0, v1, 4);
35533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(v, v0, v1, 5);
35633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(v, v0, v1, 6);
35733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  STORE6_LANE(v, v0, v1, 7);
35833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
35933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef STORE6_LANE
36033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
36133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
36233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   const uint8x16_t q0, const uint8x16_t q1,
36333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   uint8_t* const u, uint8_t* const v,
36433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                   int stride) {
36533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x8x4_t u0, v0;
36633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(u0,
36733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vget_low_u8(p1), vget_low_u8(p0),
36833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vget_low_u8(q0), vget_low_u8(q1));
36933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR4(v0,
37033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vget_high_u8(p1), vget_high_u8(p0),
37133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora               vget_high_u8(q0), vget_high_u8(q1));
37233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
37333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
37433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
37533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
37633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
37733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
37833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
37933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
38033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
38133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
38233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
38333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
38433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
38533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
38633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
38733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
38833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
38933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
39033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // !WORK_AROUND_GCC
39133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
39233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
39333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
39433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
39533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
39633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
39733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
39833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// to the corresponding rows of 'dst'.
39933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
40033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                            const int16x8_t dst01,
40133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                            const int16x8_t dst23) {
40233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Unsigned saturate to 8b.
40333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
40433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
40533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
40633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Store the results.
40733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
40833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
40933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
41033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
41133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
41233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
41333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
41433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                               uint8_t* const dst) {
41533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x2_t dst01 = vdup_n_u32(0);
41633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32x2_t dst23 = vdup_n_u32(0);
41733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
41833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // Load the source pixels.
41933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
42033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
42133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
42233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
42333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
42433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
42533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Convert to 16b.
42633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
42733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
42833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
42933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Descale with rounding.
43033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
43133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
43233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Add the inverse transform.
43333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    SaturateAndStore4x4(dst, out01, out23);
43433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
43533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
43633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
43733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//-----------------------------------------------------------------------------
43833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Simple In-loop filtering (Paragraph 15.2)
43933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
44033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
44133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                              const uint8x16_t q0, const uint8x16_t q1,
44233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                              int thresh) {
44333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
44433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
44533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
44633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
44733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
44833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
44933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
45033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return mask;
45133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
45233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
45333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int8x16_t FlipSign(const uint8x16_t v) {
45433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
45533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
45633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
45733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
45833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic uint8x16_t FlipSignBack(const int8x16_t v) {
45933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t sign_bit = vdupq_n_s8(0x80);
46033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
46133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
46233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
46333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
46433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                              const int8x16_t q0, const int8x16_t q1) {
46533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
46633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
46733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
46833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
46933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
47033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return s3;
47133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
47233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
47333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
47433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
47533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
47633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
47733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return s2;
47833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
47933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
48033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
48133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
48233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
48333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                         const int8x16_t delta,
48433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                         uint8x16_t* const op0, uint8x16_t* const oq0) {
48533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t kCst3 = vdupq_n_s8(0x03);
48633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t kCst4 = vdupq_n_s8(0x04);
48733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
48833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
48933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
49033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
49133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
49233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
49333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *op0 = FlipSignBack(sp0);
49433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *oq0 = FlipSignBack(sq0);
49533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
49633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
49733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(USE_INTRINSICS)
49833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
49933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
50033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      const uint8x16_t q0, const uint8x16_t q1,
50133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      const uint8x16_t mask,
50233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      uint8x16_t* const op0, uint8x16_t* const oq0) {
50333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t p1s = FlipSign(p1);
50433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t p0s = FlipSign(p0);
50533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t q0s = FlipSign(q0);
50633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t q1s = FlipSign(q1);
50733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
50833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
50933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  ApplyFilter2(p0s, q0s, delta1, op0, oq0);
51033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
51133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
51233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
51333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p1, p0, q0, q1, op0, oq0;
51433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load16x4(p, stride, &p1, &p0, &q0, &q1);
51533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
51633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
51733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
51833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
51933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store16x2(op0, oq0, p, stride);
52033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
52133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
52233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
52333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p1, p0, q0, q1, oq0, op0;
52433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load4x16(p, stride, &p1, &p0, &q0, &q1);
52533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
52633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
52733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
52833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
52933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Store2x16(op0, oq0, p, stride);
53033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
53133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
53233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else
53333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// List of NEON Q registers, written as a comma-separated sequence of string
// literals so it can be appended to the clobber list of an __asm__ statement
// (see the two inline-asm filters that follow).
5348b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora#define QRegs "q0", "q1", "q2", "q3",                                          \
535a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
536a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
// Emits two 'veor' instructions: a ^= s and b ^= s. DO_FILTER2 passes a
// register filled with 0x80 as 's', which toggles the top bit of every byte.
537a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#define FLIP_SIGN_BIT2(a, b, s)                                                \
538a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "veor     " #a "," #a "," #s "               \n"                             \
539a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "veor     " #b "," #b "," #s "               \n"                             \
540a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
// Same as FLIP_SIGN_BIT2 for four registers: a, b, c and d are each XORed
// in place with 's'.
541a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
542a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  FLIP_SIGN_BIT2(a, b, s)                                                      \
543a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  FLIP_SIGN_BIT2(c, d, s)                                                      \
544a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
// Assembly counterpart of NeedsFilter(): sets every byte of 'mask' to all-ones
// where 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh (saturating unsigned
// arithmetic), zero elsewhere. 'thresh' is a core register broadcast with
// vdup.8. Overwrites q14 and q15 as scratch.
545a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
546a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
547a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
548a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
549a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
550a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
551a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vdup.8     q14, " #thresh "            \n"                                  \
552a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (thresh >= sum) */
553a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
// Assembly counterpart of GetBaseDelta(): leaves (p1 - q1) + 3 * (q0 - p0) in
// 'o', using saturating signed byte arithmetic at every step. Inputs are
// expected to be sign-flipped already. Overwrites q15 as scratch.
554a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
555a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
556a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
557a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
558a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
559a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
560a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
// Assembly counterpart of ApplyFilter2() without the final sign flip:
//   p0 += (fl + 3) >> 3   and   q0 -= (fl + 4) >> 3
// on signed bytes with saturation. 'fl' is read twice and left unchanged;
// q15 is overwritten as scratch.
561a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
562a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vmov.i8    q15, #0x03                  \n"                                  \
563a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
564a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
565a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
566a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora                                                                               \
567a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vmov.i8    q15, #0x04                  \n"                                  \
568a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
569a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
570a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
571a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
// Full simple-filter sequence for one edge. On exit p0 and q0 hold the
// filtered unsigned samples; p1 and q1 are left sign-flipped (they are not
// flipped back). Besides its arguments the sequence overwrites q9 (mask, then
// masked delta), q10 (0x80 constant), q11 (base delta), q14 and q15 (scratch
// of the helper macros), so none of these may be passed as p1/p0/q0/q1.
572a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora// Applies filter on 2 pixels (p0 and q0)
573a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
574a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
575a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
576a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
577a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
578a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
579a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
580a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  FLIP_SIGN_BIT2(p0, q0, q10)
581a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
// Simple filter, inline-assembly version. Reads four 16-byte rows at
// p - 2 * stride, p - stride, p and p + stride (p1, p0, q0, q1), runs
// DO_FILTER2 on them with 'thresh', and writes the filtered p0 and q0 back to
// rows p - stride and p. The local copy of 'p' is modified by the asm, hence
// the "+r" constraint.
58233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
583a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  __asm__ volatile (
584a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
585a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
586a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
587a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
588a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
5898b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
590a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
5918b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    DO_FILTER2(q1, q2, q3, q12, %[thresh])
592a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
    // 'p' is at the q1 row here; step back two rows to reach the p0 row.
593a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
594a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
595a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
596a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
597a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : [p] "+r"(p)
598a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : [stride] "r"(stride), [thresh] "r"(thresh)
599a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : "memory", QRegs
600a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  );
601a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora}
602a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
// Simple filter, inline-assembly version for the other edge direction.
// Sets r4 = p - 2, r5 = r4 + stride and r6 = 2 * stride, pulls the samples in
// with two LOAD8x4 invocations, rearranges them with vswp so that q1, q2, q12
// and q13 hold p1, p0, q0 and q1, runs DO_FILTER2 with 'thresh', then writes
// the filtered p0/q0 back starting at p - 1 with two STORE8x2 invocations.
// NOTE(review): LOAD8x4 and STORE8x2 are defined outside this part of the
// file; presumably they transpose 8 rows on load/store and advance their
// pointer arguments -- confirm against their definitions.
60333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
604a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  __asm__ volatile (
605a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
606a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
607a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
608a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
609a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
6108b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
6118b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
6128b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
6138b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
614a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
6158b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    DO_FILTER2(q1, q2, q12, q13, %[thresh])
616a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
617a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "sub        %[p], %[p], #1                 \n"  // p - 1
618a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
    // Regroup the halves of the filtered p0 (q2) and q0 (q12) into the
    // register pairs handed to STORE8x2.
6198b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    "vswp        d5, d24                       \n"
620a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    STORE8x2(d4, d5, [%[p]], %[stride])
6218b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora    STORE8x2(d24, d25, [%[p]], %[stride])
622a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
623a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : [p] "+r"(p)
624a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : [stride] "r"(stride), [thresh] "r"(thresh)
625a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : "memory", "r4", "r5", "r6", QRegs
626a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  );
627a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora}
628a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
62933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif    // USE_INTRINSICS
63033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Runs SimpleVFilter16() on three positions: p + 4 * stride, p + 8 * stride
// and p + 12 * stride, in that order.
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  uint8_t* edge = p;
  int i;
  for (i = 0; i < 3; ++i) {
    edge += 4 * stride;
    SimpleVFilter16(edge, stride, thresh);
  }
}
63833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Runs SimpleHFilter16() on three positions: p + 4, p + 8 and p + 12, in
// that order.
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  uint8_t* edge = p;
  int i;
  for (i = 0; i < 3; ++i) {
    edge += 4;
    SimpleHFilter16(edge, stride, thresh);
  }
}
64633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
64733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
64833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Complex In-loop filtering (Paragraph 15.3)
64933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
65033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
65133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                           const uint8x16_t q0, const uint8x16_t q1,
65233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                           int hev_thresh) {
65333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
65433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
65533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
65633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
65733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
65833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t mask = vorrq_u8(mask1, mask2);
65933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return mask;
66033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
66133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
66233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
66333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                               const uint8x16_t p1, const uint8x16_t p0,
66433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                               const uint8x16_t q0, const uint8x16_t q1,
66533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                               const uint8x16_t q2, const uint8x16_t q3,
66633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                               int ithresh, int thresh) {
66733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
66833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
66933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
67033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
67133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
67233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
67333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
67433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
67533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
67633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
67733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t max12 = vmaxq_u8(max1, max2);
67833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t max123 = vmaxq_u8(max12, max3);
67933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
68033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
68133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t mask = vandq_u8(mask1, mask2);
68233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return mask;
68333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
68433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
68533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//  4-points filter
68633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
68733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ApplyFilter4(
68833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t p1, const int8x16_t p0,
68933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t q0, const int8x16_t q1,
69033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t delta0,
69133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t* const op1, uint8x16_t* const op0,
69233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t* const oq0, uint8x16_t* const oq1) {
69333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t kCst3 = vdupq_n_s8(0x03);
69433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t kCst4 = vdupq_n_s8(0x04);
69533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
69633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
69733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
69833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
69933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
70033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *op0 = FlipSignBack(vqaddq_s8(p0, a2));  // clip(p0 + a2)
70133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
70233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *op1 = FlipSignBack(vqaddq_s8(p1, a3));  // clip(p1 + a3)
70333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *oq1 = FlipSignBack(vqsubq_s8(q1, a3));  // clip(q1 - a3)
70433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
70533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
70633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void DoFilter4(
70733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t p1, const uint8x16_t p0,
70833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t q0, const uint8x16_t q1,
70933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask, const uint8x16_t hev_mask,
71033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t* const op1, uint8x16_t* const op0,
71133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t* const oq0, uint8x16_t* const oq1) {
71233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
71333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t p1s = FlipSign(p1);
71433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int8x16_t p0s = FlipSign(p0);
71533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int8x16_t q0s = FlipSign(q0);
71633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t q1s = FlipSign(q1);
71733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
71833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
71933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // do_filter2 part (simple loopfilter on pixels with hev)
72033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
72133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
72233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t simple_lf_delta =
72333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
72433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t tmp_p0, tmp_q0;
72533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
72633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
72733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    p0s = FlipSign(tmp_p0);
72833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    q0s = FlipSign(tmp_q0);
72933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
73033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
73133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // do_filter4 part (complex loopfilter on pixels without hev)
73233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
73333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
73433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
73533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
73633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t complex_lf_delta =
73733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
73833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
73933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
74033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
74133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
74233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//  6-points filter
74333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
74433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ApplyFilter6(
74533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
74633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
74733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t delta,
74833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
74933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
75033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t kCst63 = vdupq_n_s16(63);
75133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t kCst27 = vdup_n_s8(27);
75233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t kCst18 = vdup_n_s8(18);
75333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t kCst9 = vdup_n_s8(9);
75433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t delta_lo = vget_low_s8(delta);
75533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t delta_hi = vget_high_s8(delta);
75633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo);  // 63 + 27 * a
75733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi);  // 63 + 27 * a
75833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo);  // 63 + 18 * a
75933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi);  // 63 + 18 * a
76033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo);   // 63 + 9 * a
76133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi);   // 63 + 9 * a
76233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
76333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
76433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
76533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
76633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
76733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
76833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
76933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
77033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
77133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
77233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
77333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - q1)
77433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
77533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
77633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
77733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
77833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
77933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
78033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void DoFilter6(
78133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
78233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
78333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask, const uint8x16_t hev_mask,
78433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
78533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
78633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
78733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t p2s = FlipSign(p2);
78833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t p1s = FlipSign(p1);
78933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int8x16_t p0s = FlipSign(p0);
79033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int8x16_t q0s = FlipSign(q0);
79133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t q1s = FlipSign(q1);
79233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t q2s = FlipSign(q2);
79333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
79433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
79533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
79633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // do_filter2 part (simple loopfilter on pixels with hev)
79733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
79833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t simple_lf_delta =
79933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
80033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t tmp_p0, tmp_q0;
80133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
80233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
80333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    p0s = FlipSign(tmp_p0);
80433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    q0s = FlipSign(tmp_q0);
80533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
80633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
80733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // do_filter6 part (complex loopfilter on pixels without hev)
80833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
80933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
81033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
81133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int8x16_t complex_lf_delta =
81233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
81333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
81433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                 op2, op1, op0, oq0, oq1, oq2);
81533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
81633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
81733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
81833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// on macroblock edges
81933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
82033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void VFilter16(uint8_t* p, int stride,
82133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      int thresh, int ithresh, int hev_thresh) {
82233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
82333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
82433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
82533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
82633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         ithresh, thresh);
82733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
82833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
82933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
83033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora              &op2, &op1, &op0, &oq0, &oq1, &oq2);
83133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store16x2(op2, op1, p - 2 * stride, stride);
83233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store16x2(op0, oq0, p + 0 * stride, stride);
83333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store16x2(oq1, oq2, p + 2 * stride, stride);
83433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
83533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
83633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
83733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void HFilter16(uint8_t* p, int stride,
83833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      int thresh, int ithresh, int hev_thresh) {
83933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
84033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
84133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
84233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
84333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         ithresh, thresh);
84433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
84533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
84633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
84733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora              &op2, &op1, &op0, &oq0, &oq1, &oq2);
84833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store2x16(op2, op1, p - 2, stride);
84933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store2x16(op0, oq0, p + 0, stride);
85033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store2x16(oq1, oq2, p + 2, stride);
85133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
85233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
85333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
85433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// on three inner edges
85533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void VFilter16i(uint8_t* p, int stride,
85633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                       int thresh, int ithresh, int hev_thresh) {
85733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32_t k;
85833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p3, p2, p1, p0;
85933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load16x4(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
86033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (k = 3; k != 0; --k) {
86133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t q0, q1, q2, q3;
862a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    p += 4 * stride;
86333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Load16x4(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
86433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    {
86533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint8x16_t mask =
86633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
86733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
86833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      // p3 and p2 are not just temporary variables here: they will be
86933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
87033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
87133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      Store16x4(p1, p0, p3, p2, p, stride);
87233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      p1 = q2;
87333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      p0 = q3;
87433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    }
875a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  }
876a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora}
877a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
87833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if !defined(WORK_AROUND_GCC)
87933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void HFilter16i(uint8_t* p, int stride,
88033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                       int thresh, int ithresh, int hev_thresh) {
88133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint32_t k;
88233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p3, p2, p1, p0;
88333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
88433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  for (k = 3; k != 0; --k) {
88533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t q0, q1, q2, q3;
886a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    p += 4;
88733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
88833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    {
88933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint8x16_t mask =
89033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
89133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
89233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
89333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      Store4x16(p1, p0, p3, p2, p, stride);
89433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      p1 = q2;
89533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      p0 = q3;
89633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    }
89733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
89833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
89933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // !WORK_AROUND_GCC
90033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
90133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// 8-pixels wide variant, for chroma filtering
90233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void VFilter8(uint8_t* u, uint8_t* v, int stride,
90333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                     int thresh, int ithresh, int hev_thresh) {
90433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
90533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
90633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
90733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
90833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         ithresh, thresh);
90933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
91033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
91133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
91233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora              &op2, &op1, &op0, &oq0, &oq1, &oq2);
91333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
91433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
91533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
91633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
91733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
91833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void VFilter8i(uint8_t* u, uint8_t* v, int stride,
91933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      int thresh, int ithresh, int hev_thresh) {
92033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
92133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  u += 4 * stride;
92233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  v += 4 * stride;
92333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
92433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
92533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
92633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         ithresh, thresh);
92733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
92833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t op1, op0, oq0, oq1;
92933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
93033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
93133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
93233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
93333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
93433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if !defined(WORK_AROUND_GCC)
93533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void HFilter8(uint8_t* u, uint8_t* v, int stride,
93633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                     int thresh, int ithresh, int hev_thresh) {
93733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
93833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
93933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
94033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
94133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         ithresh, thresh);
94233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
94333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
94433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
94533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora              &op2, &op1, &op0, &oq0, &oq1, &oq2);
94633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
94733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
94833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
94933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
95033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void HFilter8i(uint8_t* u, uint8_t* v, int stride,
95133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      int thresh, int ithresh, int hev_thresh) {
95233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
95333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  u += 4;
95433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  v += 4;
95533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
95633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
95733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
95833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                         ithresh, thresh);
95933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
96033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    uint8x16_t op1, op0, oq0, oq1;
96133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
96233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
963a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  }
964a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora}
96533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif  // !WORK_AROUND_GCC
966a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
9671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//-----------------------------------------------------------------------------
9681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Inverse transforms (Paragraph 14.4)
9691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
97033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Technically these are unsigned but vqdmulh is only available in signed.
97133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// vqdmulh returns high half (effectively >> 16) but also doubles the value,
97233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// changing the >> 16 to >> 15 and requiring an additional >> 1.
97333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// We use this to our advantage with kC2. The canonical value is 35468.
97433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// However, the high bit is set so treating it as signed will give incorrect
97533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// results. We avoid this by down shifting by 1 here to clear the highest bit.
97633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Combined with the doubling effect of vqdmulh we get >> 16.
97733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// This can not be applied to kC1 because the lowest bit is set. Down shifting
97833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// the constant would reduce precision.
97933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
98033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// libwebp uses a trick to avoid some extra addition that libvpx does.
98133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Instead of:
98233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2.
98533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Multipliers handed to vqdmulh, which doubles the product before returning
// its high half.
// kC1 is used as-is; the code halves the vqdmulh result and adds the input
// back to obtain x + ((x * 20091) >> 16).
static const int16_t kC1 = 20091;
// Half of the canonical 35468, so that the doubling in vqdmulh yields
// (x * 35468) >> 16 while the constant still fits in a signed 16-bit lane.
static const int16_t kC2 = 35468 >> 1;  // == 17734
98833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
98933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(USE_INTRINSICS)
99033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
99133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                     int16x8x2_t* const out) {
99233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
99333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
99433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
99533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                                  // b0 d0 b1 d1 b2 d2 ...
99633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
99733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
99833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
99933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
100033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // {rows} = in0 | in4
100133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  //          in8 | in12
100233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // B1 = in4 | in12
100333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t B1 =
100433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
100533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // C0 = kC1 * in4 | kC1 * in12
100633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // C1 = kC2 * in4 | kC2 * in12
100733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
100833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
100933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
101033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                vget_low_s16(rows->val[1]));   // in0 + in8
101133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
101233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                vget_low_s16(rows->val[1]));   // in0 - in8
101333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // c = kC2 * in4 - kC1 * in12
101433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // d = kC1 * in4 + kC2 * in12
101533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
101633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
101733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
101833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
101933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
102033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
102133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
102233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Transpose8x2(E0, E1, rows);
102333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
102433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
10258b720228d581a84fd173b6dcb2fa295b59db489aVikas Arorastatic void TransformOne(const int16_t* in, uint8_t* dst) {
102633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int16x8x2_t rows;
102733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
102833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  TransformPass(&rows);
102933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  TransformPass(&rows);
103033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Add4x4(rows.val[0], rows.val[1], dst);
103133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
103233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
103333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else
1034a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
103533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void TransformOne(const int16_t* in, uint8_t* dst) {
103633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int kBPS = BPS;
103733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // kC1, kC2. Padded because vld1.16 loads 8 bytes
103833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16_t constants[4] = { kC1, kC2, 0, 0 };
1039a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1040a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  __asm__ volatile (
1041a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.16         {q1, q2}, [%[in]]           \n"
1042a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.16         {d0}, [%[constants]]        \n"
1043a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1044a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* d2: in[0]
1045a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d3: in[8]
1046a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d4: in[4]
1047a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d5: in[12]
1048a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1049a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vswp            d3, d4                      \n"
1050a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1051a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1052a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * q9 = {in[4], in[12]} * kC2 >> 16
1053a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1054a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1055a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1056a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1057a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* d22 = a = in[0] + in[8]
1058a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d23 = b = in[0] - in[8]
1059a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1060a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
1061a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
1062a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1063a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* The multiplication should be x * kC1 >> 16
1064a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * However, with vqdmulh we get x * kC1 * 2 >> 16
1065a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * (multiply, double, return high half)
1066a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * We avoided this in kC2 by pre-shifting the constant.
1067a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * q8 = in[4]/[12] * kC1 >> 16
1068a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1069a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
1070a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1071a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* Add {in[4], in[12]} back after the multiplication. This is handled by
1072a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * adding 1 << 16 to kC1 in the libwebp C code.
1073a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1074a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
1075a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1076a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* d20 = c = in[4]*kC2 - in[12]*kC1
1077a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d21 = d = in[4]*kC1 + in[12]*kC2
1078a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1079a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
1080a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
1081a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1082a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* d2 = tmp[0] = a + d
1083a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d3 = tmp[1] = b + c
1084a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d4 = tmp[2] = b - c
1085a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d5 = tmp[3] = a - d
1086a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1087a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
1088a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
1089a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
1090a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
1091a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1092a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vzip.16         q1, q2                      \n"
1093a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vzip.16         q1, q2                      \n"
1094a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1095a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vswp            d3, d4                      \n"
1096a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1097a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1098a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1099a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1100a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqdmulh.s16     q8, q2, d0[0]               \n"
1101a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqdmulh.s16     q9, q2, d0[1]               \n"
1102a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1103a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* d22 = a = tmp[0] + tmp[8]
1104a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d23 = b = tmp[0] - tmp[8]
1105a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1106a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       d22, d2, d3                 \n"
1107a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqsub.s16       d23, d2, d3                 \n"
1108a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1109a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* See long winded explanations prior */
1110a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vshr.s16        q8, q8, #1                  \n"
1111a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       q8, q2, q8                  \n"
1112a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1113a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* d20 = c = in[4]*kC2 - in[12]*kC1
1114a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d21 = d = in[4]*kC1 + in[12]*kC2
1115a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1116a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqsub.s16       d20, d18, d17               \n"
1117a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       d21, d19, d16               \n"
1118a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1119a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* d2 = tmp[0] = a + d
1120a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d3 = tmp[1] = b + c
1121a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d4 = tmp[2] = b - c
1122a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     * d5 = tmp[3] = a - d
1123a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora     */
1124a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       d2, d22, d21                \n"
1125a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       d3, d23, d20                \n"
1126a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqsub.s16       d4, d23, d20                \n"
1127a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqsub.s16       d5, d22, d21                \n"
1128a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1129a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
1130a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
1131a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
1132a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
1133a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1134a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
1135a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1136a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* (val) + 4 >> 3 */
1137a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vrshr.s16       d2, d2, #3                  \n"
1138a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vrshr.s16       d3, d3, #3                  \n"
1139a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vrshr.s16       d4, d4, #3                  \n"
1140a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vrshr.s16       d5, d5, #3                  \n"
1141a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1142a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vzip.16         q1, q2                      \n"
1143a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vzip.16         q1, q2                      \n"
1144a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1145a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    /* Must accumulate before saturating */
1146a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vmovl.u8        q8, d6                      \n"
1147a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vmovl.u8        q9, d7                      \n"
1148a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1149a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       q1, q1, q8                  \n"
1150a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqadd.s16       q2, q2, q9                  \n"
1151a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1152a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqmovun.s16     d0, q1                      \n"
1153a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vqmovun.s16     d1, q2                      \n"
1154a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1155a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
1156a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
1157a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
1158a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    "vst1.32         d1[1], [%[dst]]             \n"
1159a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
1160a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
1161a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
1162a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
1163a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora  );
1164a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora}
1165a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
116633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif    // USE_INTRINSICS
116733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
// Applies TransformOne() to the block at (in, dst) and, when 'do_two' is
// non-zero, to a second block at (in + 16, dst + 4) as well.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOne(in + 16 * n, dst + 4 * n);
  }
}
1174a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
11758b720228d581a84fd173b6dcb2fa295b59db489aVikas Arorastatic void TransformDC(const int16_t* in, uint8_t* dst) {
117633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t DC = vdupq_n_s16(in[0]);
117733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Add4x4(DC, DC, dst);
117833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
11798b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora
118033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
11818b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora
118233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define STORE_WHT(dst, col, rows) do {                  \
118333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
118433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
118533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
118633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
118733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} while (0)
11888b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora
118933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void TransformWHT(const int16_t* in, int16_t* out) {
119033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  int32x4x4_t tmp;
119133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
119233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
119333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Load the source.
119433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t in00_03 = vld1_s16(in + 0);
119533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t in04_07 = vld1_s16(in + 4);
119633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t in08_11 = vld1_s16(in + 8);
119733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int16x4_t in12_15 = vld1_s16(in + 12);
119833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
119933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
120033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
120133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
120233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[0] = vaddq_s32(a0, a1);
120333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[1] = vaddq_s32(a3, a2);
120433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[2] = vsubq_s32(a0, a1);
120533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[3] = vsubq_s32(a3, a2);
120633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // Arrange the temporary results column-wise.
120733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp = Transpose4x4(tmp);
120833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
12098b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora
121033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  {
121133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t kCst3 = vdupq_n_s32(3);
121233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
121333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
121433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
121533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
121633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
121733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
121833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[0] = vaddq_s32(a0, a1);
121933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[1] = vaddq_s32(a3, a2);
122033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[2] = vsubq_s32(a0, a1);
122133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[3] = vsubq_s32(a3, a2);
122233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
122333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // right shift the results by 3.
122433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
122533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
122633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
122733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
122833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
122933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    STORE_WHT(out, 0, tmp);
123033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    STORE_WHT(out, 1, tmp);
123133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    STORE_WHT(out, 2, tmp);
123233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    STORE_WHT(out, 3, tmp);
123333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
12348b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora}
12358b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora
123633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef STORE_WHT
123733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
123833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------
123933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
124033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define MUL(a, b) (((a) * (b)) >> 16)
124133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void TransformAC3(const int16_t* in, uint8_t* dst) {
124233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  static const int kC1_full = 20091 + (1 << 16);
124333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  static const int kC2_full = 35468;
124433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t A = vdup_n_s16(in[0]);
124533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
124633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
124733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int c1 = MUL(in[1], kC2_full);
124833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int d1 = MUL(in[1], kC1_full);
124933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
125033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      (uint64_t)( c1 & 0xffff) << 16 |
125133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      (uint64_t)(-c1 & 0xffff) << 32 |
125233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      (uint64_t)(-d1 & 0xffff) << 48;
125333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t CD = vcreate_s16(cd);
125433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x4_t B = vqadd_s16(A, CD);
125533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
125633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
125733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  Add4x4(m0_m1, m2_m3, dst);
12581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
125933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef MUL
12601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
12611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#endif   // WEBP_USE_NEON
12621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
12631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
12641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Entry point
12651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
extern void VP8DspInitNEON(void);

// Points the VP8 transform and loop-filter function pointers at the NEON
// implementations in this file. Does nothing when WEBP_USE_NEON is not
// defined.
void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  // Inverse transforms.
  VP8Transform = TransformTwo;
  VP8TransformDC = TransformDC;
  VP8TransformAC3 = TransformAC3;
  VP8TransformWHT = TransformWHT;

  // Simple loop filters.
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  // Complex loop filters that are always installed.
  VP8VFilter16 = VFilter16;
  VP8VFilter16i = VFilter16i;
  VP8VFilter8 = VFilter8;
  VP8VFilter8i = VFilter8i;
  VP8HFilter16 = HFilter16;

  // These three are only installed when WORK_AROUND_GCC is not defined;
  // otherwise the corresponding pointers are left untouched here.
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i;
  VP8HFilter8 = HFilter8;
  VP8HFilter8i = HFilter8i;
#endif
#endif   // WEBP_USE_NEON
}
1293