// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "./dsp.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#if defined(WEBP_USE_NEON)
182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#include "./neon.h"
202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "../dec/vp8i.h"
212a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
//------------------------------------------------------------------------------
// NxM Loading functions
245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
// Load/Store vertical edge

// LOAD8x4 expands to an inline-assembly string made of eight "vld4.8"
// single-lane loads, one per lane index 0..7. Each instruction targets the
// same lane of the four registers c1..c4. Even lanes address through b1 and
// odd lanes through b2; 'stride' is emitted as the trailing operand of every
// instruction. All arguments are stringified verbatim with '#', so they must
// be spelled exactly as the enclosing asm statement expects them.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
// STORE8x2 expands to an inline-assembly string made of eight "vst2.8"
// single-lane stores, one per lane index 0..7. Each instruction writes the
// same lane of the two registers c1 and c2 through the address operand p,
// with 'stride' emitted as the trailing operand. All arguments are
// stringified verbatim with '#'.
#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC)
475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// (register alloc, probably). The variants somewhat mitigate the problem, but
505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// not quite. HFilter16i() remains problematic.
515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x8_t zero = vdup_n_u8(0);
535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x8x4_t out;
545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR4(out, zero, zero, zero, zero);
555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  out = vld4_lane_u8(src + 0 * stride, out, 0);
565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  out = vld4_lane_u8(src + 1 * stride, out, 1);
575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  out = vld4_lane_u8(src + 2 * stride, out, 2);
585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  out = vld4_lane_u8(src + 3 * stride, out, 3);
595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  out = vld4_lane_u8(src + 4 * stride, out, 4);
605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  out = vld4_lane_u8(src + 5 * stride, out, 5);
615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  out = vld4_lane_u8(src + 6 * stride, out, 6);
625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  out = vld4_lane_u8(src + 7 * stride, out, 7);
635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return out;
645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const p1, uint8x16_t* const p0,
685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const q0, uint8x16_t* const q1) {
695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#else  // WORK_AROUND_GCC
805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  src += stride;                                                     \
845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} while (0)
855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const p1, uint8x16_t* const p0,
885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const q0, uint8x16_t* const q1) {
895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint32x4_t zero = vdupq_n_u32(0);
905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint32x4x4_t in;
915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR4(in, zero, zero, zero, zero);
925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  src -= 2;
935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[0], 0);
945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[1], 0);
955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[2], 0);
965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[3], 0);
975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[0], 1);
985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[1], 1);
995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[2], 1);
1005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[3], 1);
1015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[0], 2);
1025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[1], 2);
1035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[2], 2);
1045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[3], 2);
1055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[0], 3);
1065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[1], 3);
1075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[2], 3);
1085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  LOADQ_LANE_32b(in.val[3], 3);
1095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // Transpose four 4x4 parts:
1105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
1115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
1125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                        vreinterpretq_u8_u32(in.val[1]));
1135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
1145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                        vreinterpretq_u8_u32(in.val[3]));
1155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
1165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                         vreinterpretq_u16_u8(row23.val[0]));
1175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
1185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                         vreinterpretq_u16_u8(row23.val[1]));
1195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    *p1 = vreinterpretq_u8_u16(row02.val[0]);
1205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    *p0 = vreinterpretq_u8_u16(row13.val[0]);
1215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    *q0 = vreinterpretq_u8_u16(row02.val[1]);
1225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    *q1 = vreinterpretq_u8_u16(row13.val[1]);
1235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
1245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
1255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef LOADQ_LANE_32b
1265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
1275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif  // !WORK_AROUND_GCC
1285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
1295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
1305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const p3, uint8x16_t* const p2,
1315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const p1, uint8x16_t* const p0,
1325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const q0, uint8x16_t* const q1,
1335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const q2, uint8x16_t* const q3) {
1345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load4x16(src - 2, stride, p3, p2, p1, p0);
1355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load4x16(src + 2, stride, q0, q1, q2, q3);
1365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
1375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
1385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
1395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const p1, uint8x16_t* const p0,
1405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const q0, uint8x16_t* const q1) {
1415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p1 = vld1q_u8(src - 2 * stride);
1425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p0 = vld1q_u8(src - 1 * stride);
1435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q0 = vld1q_u8(src + 0 * stride);
1445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q1 = vld1q_u8(src + 1 * stride);
1455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
1465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
1475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
1485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const p3, uint8x16_t* const p2,
1495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const p1, uint8x16_t* const p0,
1505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const q0, uint8x16_t* const q1,
1515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8x16_t* const q2, uint8x16_t* const q3) {
1525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load16x4(src - 2  * stride, stride, p3, p2, p1, p0);
1535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load16x4(src + 2  * stride, stride, q0, q1, q2, q3);
1545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
1555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
1565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
1575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  const uint8_t* const v,
1585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  int stride,
1595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  uint8x16_t* const p3, uint8x16_t* const p2,
1605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  uint8x16_t* const p1, uint8x16_t* const p0,
1615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  uint8x16_t* const q0, uint8x16_t* const q1,
1625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  uint8x16_t* const q2, uint8x16_t* const q3) {
1635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
1645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // and the v-samples on the higher half.
1655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
1665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
1675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
1685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
1695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
1705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
1715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
1725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
1735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
1745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
1755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC)
1765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
1775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define LOAD_UV_8(ROW) \
1785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
1795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
1805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
1815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   const uint8_t* const v,
1825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   int stride,
1835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   uint8x16_t* const p3, uint8x16_t* const p2,
1845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   uint8x16_t* const p1, uint8x16_t* const p0,
1855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   uint8x16_t* const q0, uint8x16_t* const q1,
1865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   uint8x16_t* const q2, uint8x16_t* const q3) {
1875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
1885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // and the v-samples on the higher half.
1895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t row0 = LOAD_UV_8(0);
1905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t row1 = LOAD_UV_8(1);
1915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t row2 = LOAD_UV_8(2);
1925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t row3 = LOAD_UV_8(3);
1935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t row4 = LOAD_UV_8(4);
1945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t row5 = LOAD_UV_8(5);
1955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t row6 = LOAD_UV_8(6);
1965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t row7 = LOAD_UV_8(7);
1975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // Perform two side-by-side 8x8 transposes
1985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
1995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
2005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
2015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
2025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
2035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
2045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
2055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
2065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
2075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                                    // u01 u11 u03 u13 ...
2085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
2095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                                    // u21 u31 u23 u33 ...
2105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
2115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
2125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
2135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                       vreinterpretq_u16_u8(row23.val[0]));
2145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
2155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                       vreinterpretq_u16_u8(row23.val[1]));
2165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
2175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                       vreinterpretq_u16_u8(row67.val[0]));
2185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
2195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                       vreinterpretq_u16_u8(row67.val[1]));
2205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
2215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                       vreinterpretq_u32_u16(row46.val[0]));
2225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
2235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                       vreinterpretq_u32_u16(row46.val[1]));
2245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
2255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                       vreinterpretq_u32_u16(row57.val[0]));
2265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
2275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                       vreinterpretq_u32_u16(row57.val[1]));
2285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p3 = vreinterpretq_u8_u32(row04.val[0]);
2295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p2 = vreinterpretq_u8_u32(row15.val[0]);
2305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p1 = vreinterpretq_u8_u32(row26.val[0]);
2315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *p0 = vreinterpretq_u8_u32(row37.val[0]);
2325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q0 = vreinterpretq_u8_u32(row04.val[1]);
2335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q1 = vreinterpretq_u8_u32(row15.val[1]);
2345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q2 = vreinterpretq_u8_u32(row26.val[1]);
2355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *q3 = vreinterpretq_u8_u32(row37.val[1]);
2365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
2375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef LOAD_UV_8
2385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
2395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif  // !WORK_AROUND_GCC
2405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
2415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
2425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8_t* const dst, int stride) {
2435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst2_lane_u8(dst + 0 * stride, v, 0);
2445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst2_lane_u8(dst + 1 * stride, v, 1);
2455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst2_lane_u8(dst + 2 * stride, v, 2);
2465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst2_lane_u8(dst + 3 * stride, v, 3);
2475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst2_lane_u8(dst + 4 * stride, v, 4);
2485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst2_lane_u8(dst + 5 * stride, v, 5);
2495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst2_lane_u8(dst + 6 * stride, v, 6);
2505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst2_lane_u8(dst + 7 * stride, v, 7);
2515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
2525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
2535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
2545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  uint8_t* const dst, int stride) {
2555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x8x2_t lo, hi;
2565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  lo.val[0] = vget_low_u8(p0);
2575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  lo.val[1] = vget_low_u8(q0);
2585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  hi.val[0] = vget_high_u8(p0);
2595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  hi.val[1] = vget_high_u8(q0);
2605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store2x8(lo, dst - 1 + 0 * stride, stride);
2615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store2x8(hi, dst - 1 + 8 * stride, stride);
2625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
2635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
2645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC)
2655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
2665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                 uint8_t* const dst, int stride) {
2675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(dst + 0 * stride, v, 0);
2685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(dst + 1 * stride, v, 1);
2695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(dst + 2 * stride, v, 2);
2705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(dst + 3 * stride, v, 3);
2715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(dst + 4 * stride, v, 4);
2725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(dst + 5 * stride, v, 5);
2735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(dst + 6 * stride, v, 6);
2745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(dst + 7 * stride, v, 7);
2755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
2765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
2775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
2785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  const uint8x16_t q0, const uint8x16_t q1,
2795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  uint8_t* const dst, int stride) {
2805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x8x4_t lo, hi;
2815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR4(lo,
2825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)               vget_low_u8(p1), vget_low_u8(p0),
2835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)               vget_low_u8(q0), vget_low_u8(q1));
2845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR4(hi,
2855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)               vget_high_u8(p1), vget_high_u8(p0),
2865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)               vget_high_u8(q0), vget_high_u8(q1));
2875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store4x8(lo, dst - 2 + 0 * stride, stride);
2885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store4x8(hi, dst - 2 + 8 * stride, stride);
2895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
2905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif  // !WORK_AROUND_GCC
2915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
2925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
2935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  uint8_t* const dst, int stride) {
2945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1q_u8(dst - stride, p0);
2955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1q_u8(dst, q0);
2965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
2975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
2985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
2995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  const uint8x16_t q0, const uint8x16_t q1,
3005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                  uint8_t* const dst, int stride) {
3015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store16x2(p1, p0, dst - stride, stride);
3025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store16x2(q0, q1, dst + stride, stride);
3035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
3045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
3065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   uint8_t* const u, uint8_t* const v,
3075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   int stride) {
3085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // p0 and q0 contain the u+v samples packed in low/high halves.
3095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1_u8(u - stride, vget_low_u8(p0));
3105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1_u8(u,          vget_low_u8(q0));
3115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1_u8(v - stride, vget_high_u8(p0));
3125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1_u8(v,          vget_high_u8(q0));
3135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
3145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
3165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   const uint8x16_t q0, const uint8x16_t q1,
3175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   uint8_t* const u, uint8_t* const v,
3185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   int stride) {
3195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // The p1...q1 registers contain the u+v samples packed in low/high halves.
3205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store8x2x2(p1, p0, u - stride, v - stride, stride);
3215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store8x2x2(q0, q1, u + stride, v + stride, stride);
3225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
3235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC)
3255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
3275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
3285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
3295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  (DST) += stride;                                \
3305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} while (0)
3315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
3335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   const uint8x16_t p0, const uint8x16_t q0,
3345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   const uint8x16_t q1, const uint8x16_t q2,
3355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   uint8_t* u, uint8_t* v,
3365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   int stride) {
3375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x8x3_t u0, u1, v0, v1;
3385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
3395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
3405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
3415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
3425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(u, u0, u1, 0);
3435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(u, u0, u1, 1);
3445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(u, u0, u1, 2);
3455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(u, u0, u1, 3);
3465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(u, u0, u1, 4);
3475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(u, u0, u1, 5);
3485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(u, u0, u1, 6);
3495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(u, u0, u1, 7);
3505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(v, v0, v1, 0);
3515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(v, v0, v1, 1);
3525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(v, v0, v1, 2);
3535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(v, v0, v1, 3);
3545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(v, v0, v1, 4);
3555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(v, v0, v1, 5);
3565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(v, v0, v1, 6);
3575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  STORE6_LANE(v, v0, v1, 7);
3585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
3595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef STORE6_LANE
3605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
3625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   const uint8x16_t q0, const uint8x16_t q1,
3635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   uint8_t* const u, uint8_t* const v,
3645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                   int stride) {
3655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x8x4_t u0, v0;
3665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR4(u0,
3675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)               vget_low_u8(p1), vget_low_u8(p0),
3685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)               vget_low_u8(q0), vget_low_u8(q1));
3695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR4(v0,
3705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)               vget_high_u8(p1), vget_high_u8(p0),
3715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)               vget_high_u8(q0), vget_high_u8(q1));
3725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
3735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
3745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
3755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
3765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
3775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
3785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
3795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
3805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
3815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
3825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
3835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
3845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
3855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
3865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
3875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
3885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
3895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif  // !WORK_AROUND_GCC
3915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
3935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
3945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
3955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
3965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
3975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
3985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// to the corresponding rows of 'dst'.
3995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
4005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                            const int16x8_t dst01,
4015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                            const int16x8_t dst23) {
4025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // Unsigned saturate to 8b.
4035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
4045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
4055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // Store the results.
4075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
4085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
4095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
4105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
4115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
4125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
4145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                               uint8_t* const dst) {
4155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint32x2_t dst01 = vdup_n_u32(0);
4165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint32x2_t dst23 = vdup_n_u32(0);
4175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // Load the source pixels.
4195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
4205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
4215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
4225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
4235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
4255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // Convert to 16b.
4265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
4275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
4285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // Descale with rounding.
4305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
4315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
4325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // Add the inverse transform.
4335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    SaturateAndStore4x4(dst, out01, out23);
4345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
4355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
4365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//-----------------------------------------------------------------------------
4385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Simple In-loop filtering (Paragraph 15.2)
4395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
4415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                              const uint8x16_t q0, const uint8x16_t q1,
4425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                              int thresh) {
4435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
4445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
4455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
4465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
4475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
4485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
4495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
4505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return mask;
4515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
4525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static int8x16_t FlipSign(const uint8x16_t v) {
4545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
4555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
4565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
4575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static uint8x16_t FlipSignBack(const int8x16_t v) {
4595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t sign_bit = vdupq_n_s8(0x80);
4605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
4615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
4625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
4645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                              const int8x16_t q0, const int8x16_t q1) {
4655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
4665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
4675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
4685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
4695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
4705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return s3;
4715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
4725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
4745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
4755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
4765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
4775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return s2;
4785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
4795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------
4815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
4835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                         const int8x16_t delta,
4845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                         uint8x16_t* const op0, uint8x16_t* const oq0) {
4855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t kCst3 = vdupq_n_s8(0x03);
4865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t kCst4 = vdupq_n_s8(0x04);
4875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
4885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
4895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
4905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
4915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
4925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
4935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *op0 = FlipSignBack(sp0);
4945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *oq0 = FlipSignBack(sq0);
4955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
4965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if defined(USE_INTRINSICS)
4985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
4995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
5005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      const uint8x16_t q0, const uint8x16_t q1,
5015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      const uint8x16_t mask,
5025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      uint8x16_t* const op0, uint8x16_t* const oq0) {
5035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t p1s = FlipSign(p1);
5045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t p0s = FlipSign(p0);
5055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t q0s = FlipSign(q0);
5065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t q1s = FlipSign(q1);
5075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
5085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
5095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  ApplyFilter2(p0s, q0s, delta1, op0, oq0);
5105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
5115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
5125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
5135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p1, p0, q0, q1, op0, oq0;
5145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load16x4(p, stride, &p1, &p0, &q0, &q1);
5155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
5165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
5175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
5185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
5195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store16x2(op0, oq0, p, stride);
5205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
5215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
5225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
5235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p1, p0, q0, q1, oq0, op0;
5245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load4x16(p, stride, &p1, &p0, &q0, &q1);
5255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
5265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
5275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
5285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
5295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Store2x16(op0, oq0, p, stride);
5305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
5315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
5325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#else
5335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
// Clobber list for the inline-assembly routines below: names the NEON q
// registers (q0-q3 and q8-q15) that the compiler must treat as overwritten.
// q4-q7 are not listed.
// NOTE(review): presumably q4-q7 are left out because they are callee-saved
// and the assembly avoids them -- confirm against the ABI in use.
5345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define QRegs "q0", "q1", "q2", "q3",                                          \
5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Emits two 'veor' instructions that XOR registers 'a' and 'b', in place, with
// register 's'. DO_FILTER2 passes a register holding 0x80 in every byte, so
// the macro toggles the sign bit of each byte (unsigned <-> signed form).
5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define FLIP_SIGN_BIT2(a, b, s)                                                \
5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "veor     " #a "," #a "," #s "               \n"                             \
5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "veor     " #b "," #b "," #s "               \n"                             \
5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Four-register form of FLIP_SIGN_BIT2: XORs 'a', 'b', 'c' and 'd', in place,
// with register 's'.
5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLIP_SIGN_BIT2(a, b, s)                                                      \
5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLIP_SIGN_BIT2(c, d, s)                                                      \
5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Assembly counterpart of NeedsFilter(): sets every byte of 'mask' to all ones
// where 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh (saturating unsigned
// arithmetic), and to zero elsewhere. 'thresh' is a core-register operand that
// gets duplicated into every byte. Uses q14 and q15 as scratch, so neither may
// be passed as an input or as 'mask' target that must survive.
5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vdup.8     q14, " #thresh "            \n"                                  \
5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (thresh >= sum) */
5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Assembly counterpart of GetBaseDelta(): leaves (p1 - q1) + 3 * (q0 - p0) in
// register 'o', computed on signed bytes with saturation at every step.
// Uses q15 as scratch for (q0 - p0).
5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Assembly counterpart of ApplyFilter2(): with 'fl' holding the (masked)
// filter delta, performs p0 += (fl + 3) >> 3 and q0 -= (fl + 4) >> 3 in place
// on signed bytes (saturating add/sub, arithmetic shift). Uses q15 as scratch.
5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vmov.i8    q15, #0x03                  \n"                                  \
5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                                                               \
5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vmov.i8    q15, #0x04                  \n"                                  \
5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Applies filter on 2 pixels (p0 and q0)
// Register usage: q9 holds the filter mask and then the masked delta, q10 the
// 0x80 sign-bit constant, q11 the base delta; q14/q15 are scratch inside the
// helper macros. None of these may be passed as p1/p0/q0/q1.
// On exit p0 and q0 hold the filtered pixels in unsigned form; p1 and q1 are
// left in their sign-flipped form (only p0/q0 are flipped back).
5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Applies filter on 2 pixels (p0 and q0)
5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLIP_SIGN_BIT2(p0, q0, q10)
5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Inline-assembly version of the simple in-loop filter for 16 pixels.
// Loads four 16-byte rows starting at p - 2 * stride (p1, p0, q0, q1 into
// q1, q2, q3, q12), runs DO_FILTER2 on them, then stores the updated p0 row at
// p - stride and the updated q0 row at p.
// 'p' is declared as a read-write operand because the post-indexed loads and
// the 'sub' instructions modify it; the q registers touched by the code are
// declared to the compiler through the QRegs clobber list.
5825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  __asm__ volatile (
5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
5895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    DO_FILTER2(q1, q2, q3, q12, %[thresh])
5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [p] "+r"(p)
5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [stride] "r"(stride), [thresh] "r"(thresh)
5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : "memory", QRegs
6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Simple in-loop filter across the vertical edge at column 'p' (asm version).
// r4 = p - 2 and r5 = r4 + stride are the two row pointers handed to
// LOAD8x4, with r6 = 2 * stride as the step. The two LOAD8x4 batches are
// then swapped around so that q1, q2, q12, q13 hold p1, p0, q0, q1 (see the
// register-layout comments on the vswp lines) before DO_FILTER2 runs.
// The filtered p0/q0 columns are written back by the two STORE8x2 calls,
// starting at p - 1.
// NOTE(review): LOAD8x4 and STORE8x2 are defined outside this chunk;
// presumably each call covers 8 rows, giving 16 rows in total -- confirm.
// r4, r5 and r6 are used as hard-coded scratch registers and are therefore
// listed in the clobber list.
6035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  __asm__ volatile (
6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
6105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
6115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
6125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
6135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    DO_FILTER2(q1, q2, q12, q13, %[thresh])
6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub        %[p], %[p], #1                 \n"  // p - 1
6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
    // Regroup the filtered p0 (q2 = d4:d5) and q0 (q12 = d24:d25) halves so
    // that each STORE8x2 call receives one d-register from each of them.
6195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    "vswp        d5, d24                       \n"
6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    STORE8x2(d4, d5, [%[p]], %[stride])
6215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    STORE8x2(d24, d25, [%[p]], %[stride])
6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [p] "+r"(p)
6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [stride] "r"(stride), [thresh] "r"(thresh)
6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : "memory", "r4", "r5", "r6", QRegs
6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif    // USE_INTRINSICS
6305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
// Runs SimpleVFilter16() on three edges, located at p + 4 * stride,
// p + 8 * stride and p + 12 * stride, with the same 'thresh' each time.
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  uint8_t* edge = p;
  int remaining = 3;
  while (remaining-- > 0) {
    edge += 4 * stride;   // step to the next edge, four rows further
    SimpleVFilter16(edge, stride, thresh);
  }
}
6385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
// Runs SimpleHFilter16() on three edges, located at columns p + 4, p + 8
// and p + 12, with the same 'thresh' each time.
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  uint8_t* edge = p;
  int remaining = 3;
  while (remaining-- > 0) {
    edge += 4;   // step to the next edge, four columns further
    SimpleHFilter16(edge, stride, thresh);
  }
}
6465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
6475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------
6485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Complex In-loop filtering (Paragraph 15.3)
6495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
6505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
6515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                           const uint8x16_t q0, const uint8x16_t q1,
6525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                           int hev_thresh) {
6535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
6545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
6555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
6565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
6575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
6585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t mask = vorrq_u8(mask1, mask2);
6595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return mask;
6605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
6615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
6625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
6635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                               const uint8x16_t p1, const uint8x16_t p0,
6645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                               const uint8x16_t q0, const uint8x16_t q1,
6655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                               const uint8x16_t q2, const uint8x16_t q3,
6665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                               int ithresh, int thresh) {
6675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
6685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
6695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
6705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
6715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
6725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
6735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
6745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
6755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
6765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
6775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t max12 = vmaxq_u8(max1, max2);
6785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t max123 = vmaxq_u8(max12, max3);
6795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
6805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
6815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t mask = vandq_u8(mask1, mask2);
6825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  return mask;
6835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
6845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
6855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//  4-points filter
6865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
6875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void ApplyFilter4(
6885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t p1, const int8x16_t p0,
6895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t q0, const int8x16_t q1,
6905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t delta0,
6915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t* const op1, uint8x16_t* const op0,
6925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t* const oq0, uint8x16_t* const oq1) {
6935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t kCst3 = vdupq_n_s8(0x03);
6945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t kCst4 = vdupq_n_s8(0x04);
6955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
6965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
6975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t a1 = vshrq_n_s8(delta1, 3);
6985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t a2 = vshrq_n_s8(delta2, 3);
6995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
7005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *op0 = FlipSignBack(vqaddq_s8(p0, a2));  // clip(p0 + a2)
7015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
7025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *op1 = FlipSignBack(vqaddq_s8(p1, a3));  // clip(p1 + a3)
7035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *oq1 = FlipSignBack(vqsubq_s8(q1, a3));  // clip(q1 - a3)
7045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
7055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
7065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void DoFilter4(
7075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t p1, const uint8x16_t p0,
7085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t q0, const uint8x16_t q1,
7095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask, const uint8x16_t hev_mask,
7105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t* const op1, uint8x16_t* const op0,
7115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t* const oq0, uint8x16_t* const oq1) {
7125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
7135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t p1s = FlipSign(p1);
7145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  int8x16_t p0s = FlipSign(p0);
7155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  int8x16_t q0s = FlipSign(q0);
7165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t q1s = FlipSign(q1);
7175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
7185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
7195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // do_filter2 part (simple loopfilter on pixels with hev)
7205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
7215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
7225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t simple_lf_delta =
7235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)        vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
7245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t tmp_p0, tmp_q0;
7255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
7265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
7275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    p0s = FlipSign(tmp_p0);
7285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    q0s = FlipSign(tmp_q0);
7295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
7305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
7315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // do_filter4 part (complex loopfilter on pixels without hev)
7325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
7335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
7345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
7355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
7365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t complex_lf_delta =
7375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
7385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
7395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
7405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
7415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
7425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//  6-points filter
7435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
7445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void ApplyFilter6(
7455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
7465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
7475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t delta,
7485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
7495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
7505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t kCst63 = vdupq_n_s16(63);
7515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t kCst27 = vdup_n_s8(27);
7525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t kCst18 = vdup_n_s8(18);
7535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t kCst9 = vdup_n_s8(9);
7545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t delta_lo = vget_low_s8(delta);
7555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t delta_hi = vget_high_s8(delta);
7565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo);  // 63 + 27 * a
7575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi);  // 63 + 27 * a
7585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo);  // 63 + 18 * a
7595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi);  // 63 + 18 * a
7605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo);   // 63 + 9 * a
7615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi);   // 63 + 9 * a
7625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
7635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
7645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
7655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
7665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
7675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
7685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
7695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
7705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
7715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
7725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
7735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - q1)
7745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
7755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
7765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
7775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
7785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
7795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
7805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void DoFilter6(
7815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
7825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
7835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask, const uint8x16_t hev_mask,
7845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
7855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
7865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
7875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t p2s = FlipSign(p2);
7885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t p1s = FlipSign(p1);
7895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  int8x16_t p0s = FlipSign(p0);
7905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  int8x16_t q0s = FlipSign(q0);
7915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t q1s = FlipSign(q1);
7925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t q2s = FlipSign(q2);
7935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
7945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
7955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
7965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // do_filter2 part (simple loopfilter on pixels with hev)
7975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
7985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t simple_lf_delta =
7995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
8005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t tmp_p0, tmp_q0;
8015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
8025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
8035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    p0s = FlipSign(tmp_p0);
8045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    q0s = FlipSign(tmp_q0);
8055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
8065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
8075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // do_filter6 part (complex loopfilter on pixels without hev)
8085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
8095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
8105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
8115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int8x16_t complex_lf_delta =
8125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
8135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
8145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                 op2, op1, op0, oq0, oq1, oq2);
8155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
8165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
8175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
8185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// on macroblock edges
8195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
8205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void VFilter16(uint8_t* p, int stride,
8215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      int thresh, int ithresh, int hev_thresh) {
8225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
8235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
8245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
8255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
8265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                         ithresh, thresh);
8275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
8285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
8295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
8305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)              &op2, &op1, &op0, &oq0, &oq1, &oq2);
8315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store16x2(op2, op1, p - 2 * stride, stride);
8325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store16x2(op0, oq0, p + 0 * stride, stride);
8335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store16x2(oq1, oq2, p + 2 * stride, stride);
8345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
8355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
8365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
8375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void HFilter16(uint8_t* p, int stride,
8385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      int thresh, int ithresh, int hev_thresh) {
8395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
8405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
8415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
8425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
8435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                         ithresh, thresh);
8445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
8455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
8465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
8475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)              &op2, &op1, &op0, &oq0, &oq1, &oq2);
8485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store2x16(op2, op1, p - 2, stride);
8495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store2x16(op0, oq0, p + 0, stride);
8505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store2x16(oq1, oq2, p + 2, stride);
8515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
8525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
8535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
8545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// on three inner edges
8555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void VFilter16i(uint8_t* p, int stride,
8565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                       int thresh, int ithresh, int hev_thresh) {
8575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint32_t k;
8585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p3, p2, p1, p0;
8595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load16x4(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
8605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  for (k = 3; k != 0; --k) {
8615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t q0, q1, q2, q3;
8625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    p += 4 * stride;
8635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Load16x4(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
8645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    {
8655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      const uint8x16_t mask =
8665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
8675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
8685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      // p3 and p2 are not just temporary variables here: they will be
8695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
8705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
8715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      Store16x4(p1, p0, p3, p2, p, stride);
8725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      p1 = q2;
8735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      p0 = q3;
8745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    }
8755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
8765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
8775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC)
8795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void HFilter16i(uint8_t* p, int stride,
8805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                       int thresh, int ithresh, int hev_thresh) {
8815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint32_t k;
8825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p3, p2, p1, p0;
8835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
8845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  for (k = 3; k != 0; --k) {
8855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t q0, q1, q2, q3;
8865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    p += 4;
8875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
8885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    {
8895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      const uint8x16_t mask =
8905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
8915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
8925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
8935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      Store4x16(p1, p0, p3, p2, p, stride);
8945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      p1 = q2;
8955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      p0 = q3;
8965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    }
8975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
8985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
8995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif  // !WORK_AROUND_GCC
9005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
9015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// 8-pixels wide variant, for chroma filtering
9025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void VFilter8(uint8_t* u, uint8_t* v, int stride,
9035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                     int thresh, int ithresh, int hev_thresh) {
9045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
9055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
9065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
9075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
9085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                         ithresh, thresh);
9095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
9105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
9115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
9125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)              &op2, &op1, &op0, &oq0, &oq1, &oq2);
9135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
9145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
9155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
9165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
9175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
9185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
9195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      int thresh, int ithresh, int hev_thresh) {
9205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
9215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  u += 4 * stride;
9225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  v += 4 * stride;
9235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
9245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
9255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
9265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                         ithresh, thresh);
9275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
9285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t op1, op0, oq0, oq1;
9295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
9305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
9315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
9325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
9335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
9345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC)
9355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void HFilter8(uint8_t* u, uint8_t* v, int stride,
9365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                     int thresh, int ithresh, int hev_thresh) {
9375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
9385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
9395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
9405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
9415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                         ithresh, thresh);
9425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
9435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
9445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
9455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)              &op2, &op1, &op0, &oq0, &oq1, &oq2);
9465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
9475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
9485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
9495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
9505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
9515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      int thresh, int ithresh, int hev_thresh) {
9525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
9535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  u += 4;
9545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  v += 4;
9555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
9565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
9575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
9585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                         ithresh, thresh);
9595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
9605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    uint8x16_t op1, op0, oq0, oq1;
9615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
9625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
9635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
9645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
9655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif  // !WORK_AROUND_GCC
9665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9672a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)//-----------------------------------------------------------------------------
9682a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// Inverse transforms (Paragraph 14.4)
9692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
9705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Technically these are unsigned but vqdmulh is only available in signed.
9715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// vqdmulh returns high half (effectively >> 16) but also doubles the value,
9725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// changing the >> 16 to >> 15 and requiring an additional >> 1.
9735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// We use this to our advantage with kC2. The canonical value is 35468.
9745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// However, the high bit is set so treating it as signed will give incorrect
9755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// results. We avoid this by down shifting by 1 here to clear the highest bit.
9765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Combined with the doubling effect of vqdmulh we get >> 16.
9775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// This can not be applied to kC1 because the lowest bit is set. Down shifting
9785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// the constant would reduce precision.
9795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, the enlarged
// constant no longer fits in the signed 16-bit operand of vqdmulh (the same
// kind of issue that is worked around for kC2 by down shifting). The NEON
// code therefore keeps the plain value of kC1 and adds the input back
// explicitly after the multiplication.
9855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
// Multipliers for the inverse transform, sized to fit the signed 16-bit
// operand of vqdmulh.
static const int16_t kC1 = 20091;
// Canonical kC2 is 35468; it is stored halved because vqdmulh doubles the
// product (see comment above).
static const int16_t kC2 = 35468 / 2;
9885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
9895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if defined(USE_INTRINSICS)
9905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
9915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                     int16x8x2_t* const out) {
9925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
9935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
9945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
9955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                                  // b0 d0 b1 d1 b2 d2 ...
9965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
9975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
9985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
9995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
10005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // {rows} = in0 | in4
10015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  //          in8 | in12
10025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // B1 = in4 | in12
10035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t B1 =
10045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
10055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // C0 = kC1 * in4 | kC1 * in12
10065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // C1 = kC2 * in4 | kC2 * in12
10075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
10085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
10095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
10105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                vget_low_s16(rows->val[1]));   // in0 + in8
10115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
10125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                                vget_low_s16(rows->val[1]));   // in0 - in8
10135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // c = kC2 * in4 - kC1 * in12
10145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // d = kC1 * in4 + kC2 * in12
10155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
10165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
10175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
10185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
10195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
10205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
10215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
10225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Transpose8x2(E0, E1, rows);
10235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
10245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
10255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)static void TransformOne(const int16_t* in, uint8_t* dst) {
10265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  int16x8x2_t rows;
10275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
10285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  TransformPass(&rows);
10295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  TransformPass(&rows);
10305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Add4x4(rows.val[0], rows.val[1], dst);
10315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
10325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
10335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#else
10345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void TransformOne(const int16_t* in, uint8_t* dst) {
10365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int kBPS = BPS;
10375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  // kC1, kC2. Padded because vld1.16 loads 8 bytes
10385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16_t constants[4] = { kC1, kC2, 0, 0 };
10395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
10405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  __asm__ volatile (
10415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.16         {q1, q2}, [%[in]]           \n"
10425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.16         {d0}, [%[constants]]        \n"
10435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d2: in[0]
10455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d3: in[8]
10465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d4: in[4]
10475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d5: in[12]
10485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
10495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp            d3, d4                      \n"
10505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
10525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * q9 = {in[4], in[12]} * kC2 >> 16
10535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
10545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q8, q2, d0[0]               \n"
10555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q9, q2, d0[1]               \n"
10565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d22 = a = in[0] + in[8]
10585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d23 = b = in[0] - in[8]
10595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
10605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d22, d2, d3                 \n"
10615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d23, d2, d3                 \n"
10625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* The multiplication should be x * kC1 >> 16
10645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * However, with vqdmulh we get x * kC1 * 2 >> 16
10655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * (multiply, double, return high half)
10665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * We avoided this in kC2 by pre-shifting the constant.
10675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * q8 = in[4]/[12] * kC1 >> 16
10685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
10695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vshr.s16        q8, q8, #1                  \n"
10705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* Add {in[4], in[12]} back after the multiplication. This is handled by
10725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * adding 1 << 16 to kC1 in the libwebp C code.
10735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
10745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q8, q2, q8                  \n"
10755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d20 = c = in[4]*kC2 - in[12]*kC1
10775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d21 = d = in[4]*kC1 + in[12]*kC2
10785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
10795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d20, d18, d17               \n"
10805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d21, d19, d16               \n"
10815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d2 = tmp[0] = a + d
10835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d3 = tmp[1] = b + c
10845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d4 = tmp[2] = b - c
10855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d5 = tmp[3] = a - d
10865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
10875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d2, d22, d21                \n"
10885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d3, d23, d20                \n"
10895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d4, d23, d20                \n"
10905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d5, d22, d21                \n"
10915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
10935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
10945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vswp            d3, d4                      \n"
10965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
10985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
10995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
11005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q8, q2, d0[0]               \n"
11015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqdmulh.s16     q9, q2, d0[1]               \n"
11025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d22 = a = tmp[0] + tmp[8]
11045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d23 = b = tmp[0] - tmp[8]
11055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
11065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d22, d2, d3                 \n"
11075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d23, d2, d3                 \n"
11085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* See long winded explanations prior */
11105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vshr.s16        q8, q8, #1                  \n"
11115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q8, q2, q8                  \n"
11125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d20 = c = in[4]*kC2 - in[12]*kC1
11145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d21 = d = in[4]*kC1 + in[12]*kC2
11155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
11165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d20, d18, d17               \n"
11175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d21, d19, d16               \n"
11185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* d2 = tmp[0] = a + d
11205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d3 = tmp[1] = b + c
11215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d4 = tmp[2] = b - c
11225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     * d5 = tmp[3] = a - d
11235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)     */
11245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d2, d22, d21                \n"
11255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       d3, d23, d20                \n"
11265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d4, d23, d20                \n"
11275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqsub.s16       d5, d22, d21                \n"
11285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
11305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
11315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
11325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
11335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
11355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* (val) + 4 >> 3 */
11375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d2, d2, #3                  \n"
11385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d3, d3, #3                  \n"
11395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d4, d4, #3                  \n"
11405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vrshr.s16       d5, d5, #3                  \n"
11415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
11435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vzip.16         q1, q2                      \n"
11445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* Must accumulate before saturating */
11465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vmovl.u8        q8, d6                      \n"
11475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vmovl.u8        q9, d7                      \n"
11485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q1, q1, q8                  \n"
11505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqadd.s16       q2, q2, q9                  \n"
11515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqmovun.s16     d0, q1                      \n"
11535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vqmovun.s16     d1, q2                      \n"
11545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
11565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
11575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
11585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    "vst1.32         d1[1], [%[dst]]             \n"
11595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
11615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
11625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
11635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
11645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
11655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif    // USE_INTRINSICS
11675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
// Applies TransformOne() to the block at (in, dst) and, when 'do_two' is
// non-zero, also to the next one: 16 coefficients further in 'in' and
// 4 bytes further in 'dst'.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int i;
  for (i = 0; i < num_blocks; ++i) {
    TransformOne(in + 16 * i, dst + 4 * i);
  }
}
11745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)static void TransformDC(const int16_t* in, uint8_t* dst) {
11765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t DC = vdupq_n_s16(in[0]);
11775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Add4x4(DC, DC, dst);
11785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)}
11795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
11805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------
11815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
// Stores lane 'col' of each of the four vectors rows.val[0..3] through 'dst',
// advancing 'dst' by 16 elements after every store (64 elements in total).
//   dst:  modifiable pointer lvalue; it is updated by the macro.
//   col:  lane index handed to vgetq_lane_s32().
//   rows: expression with a four-element 'val' member; every argument is
//         parenthesized so expressions such as '*ptr' expand correctly.
#define STORE_WHT(dst, col, rows) do {                        \
  *(dst) = vgetq_lane_s32((rows).val[0], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[1], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[2], (col)); (dst) += 16; \
  *(dst) = vgetq_lane_s32((rows).val[3], (col)); (dst) += 16; \
} while (0)
11885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
11895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void TransformWHT(const int16_t* in, int16_t* out) {
11905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  int32x4x4_t tmp;
11915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
11925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
11935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // Load the source.
11945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int16x4_t in00_03 = vld1_s16(in + 0);
11955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int16x4_t in04_07 = vld1_s16(in + 4);
11965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int16x4_t in08_11 = vld1_s16(in + 8);
11975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int16x4_t in12_15 = vld1_s16(in + 12);
11985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
11995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
12005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
12015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
12025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[0] = vaddq_s32(a0, a1);
12035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[1] = vaddq_s32(a3, a2);
12045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[2] = vsubq_s32(a0, a1);
12055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[3] = vsubq_s32(a3, a2);
12065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // Arrange the temporary results column-wise.
12075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp = Transpose4x4(tmp);
12085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
12095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
12105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  {
12115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t kCst3 = vdupq_n_s32(3);
12125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
12135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
12145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
12155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
12165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
12175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
12185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[0] = vaddq_s32(a0, a1);
12195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[1] = vaddq_s32(a3, a2);
12205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[2] = vsubq_s32(a0, a1);
12215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[3] = vsubq_s32(a3, a2);
12225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
12235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    // right shift the results by 3.
12245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
12255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
12265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
12275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
12285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
12295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    STORE_WHT(out, 0, tmp);
12305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    STORE_WHT(out, 1, tmp);
12315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    STORE_WHT(out, 2, tmp);
12325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)    STORE_WHT(out, 3, tmp);
12335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  }
12345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
12355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
12365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef STORE_WHT
12375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
12385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------
12395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)
12405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define MUL(a, b) (((a) * (b)) >> 16)
12415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void TransformAC3(const int16_t* in, uint8_t* dst) {
12425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  static const int kC1_full = 20091 + (1 << 16);
12435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  static const int kC2_full = 35468;
12445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t A = vdup_n_s16(in[0]);
12455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
12465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
12475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int c1 = MUL(in[1], kC2_full);
12485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int d1 = MUL(in[1], kC1_full);
12495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
12505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      (uint64_t)( c1 & 0xffff) << 16 |
12515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      (uint64_t)(-c1 & 0xffff) << 32 |
12525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)                      (uint64_t)(-d1 & 0xffff) << 48;
12535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t CD = vcreate_s16(cd);
12545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x4_t B = vqadd_s16(A, CD);
12555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
12565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
12575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  Add4x4(m0_m1, m2_m3, dst);
12582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)}
12595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef MUL
12602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
12612a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif   // WEBP_USE_NEON
12622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
12632a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)//------------------------------------------------------------------------------
12642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// Entry point
12652a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
extern void VP8DspInitNEON(void);

// Points the VP8 dsp function pointers at the NEON implementations from this
// file. When WEBP_USE_NEON is not defined the body is empty and nothing is
// changed.
void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  // Transforms.
  VP8Transform = TransformTwo;
  VP8TransformDC = TransformDC;
  VP8TransformAC3 = TransformAC3;
  VP8TransformWHT = TransformWHT;

  // Loop filters.
  VP8VFilter16 = VFilter16;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16 = HFilter16;
  VP8VFilter8 = VFilter8;
  VP8VFilter8i = VFilter8i;

  // Simple loop filters.
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  // These three are left at their previous values when WORK_AROUND_GCC is
  // defined.
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i;
  VP8HFilter8 = HFilter8;
  VP8HFilter8i = HFilter8i;
#endif
#endif   // WEBP_USE_NEON
}
1293