// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Authors: Somnath Banerjee (somnath@google.com) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Johann Koenig (johannkoenig@google.com) 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "./dsp.h" 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#if defined(WEBP_USE_NEON) 182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#include "./neon.h" 202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "../dec/vp8i.h" 212a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------ 235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// NxM Loading functions 245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Load/Store vertical edge 265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ 275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \ 285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \ 295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \ 305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \ 315f1c94371a64b3196d4be9466099bb892df9b88eTorne 
(Richard Coles) "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \ 325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \ 335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \ 345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n" 355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define STORE8x2(c1, c2, p, stride) \ 375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \ 385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \ 395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \ 405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \ 415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \ 425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \ 435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \ 445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n" 455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC) 475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation 
495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// (register alloc, probably). The variants somewhat mitigate the problem, but 505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// not quite. HFilter16i() remains problematic. 515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) { 525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x8_t zero = vdup_n_u8(0); 535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x4_t out; 545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR4(out, zero, zero, zero, zero); 555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) out = vld4_lane_u8(src + 0 * stride, out, 0); 565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) out = vld4_lane_u8(src + 1 * stride, out, 1); 575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) out = vld4_lane_u8(src + 2 * stride, out, 2); 585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) out = vld4_lane_u8(src + 3 * stride, out, 3); 595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) out = vld4_lane_u8(src + 4 * stride, out, 4); 605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) out = vld4_lane_u8(src + 5 * stride, out, 5); 615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) out = vld4_lane_u8(src + 6 * stride, out, 6); 625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) out = vld4_lane_u8(src + 7 * stride, out, 7); 635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return out; 645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride, 675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p1, 
uint8x16_t* const p0, 685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q0, uint8x16_t* const q1) { 695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7] 705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15] 715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride); 725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride); 735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p1 = vcombine_u8(row0.val[0], row8.val[0]); 745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p0 = vcombine_u8(row0.val[1], row8.val[1]); 755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q0 = vcombine_u8(row0.val[2], row8.val[2]); 765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q1 = vcombine_u8(row0.val[3], row8.val[3]); 775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#else // WORK_AROUND_GCC 805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define LOADQ_LANE_32b(VALUE, LANE) do { \ 825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \ 835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) src += stride; \ 845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} while (0) 855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load4x16(const uint8_t* src, int stride, 875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p1, 
uint8x16_t* const p0, 885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q0, uint8x16_t* const q1) { 895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint32x4_t zero = vdupq_n_u32(0); 905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint32x4x4_t in; 915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR4(in, zero, zero, zero, zero); 925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) src -= 2; 935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[0], 0); 945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[1], 0); 955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[2], 0); 965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[3], 0); 975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[0], 1); 985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[1], 1); 995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[2], 1); 1005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[3], 1); 1015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[0], 2); 1025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[1], 2); 1035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[2], 2); 1045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[3], 2); 1055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[0], 3); 1065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[1], 3); 1075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) LOADQ_LANE_32b(in.val[2], 3); 1085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 
LOADQ_LANE_32b(in.val[3], 3); 1095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Transpose four 4x4 parts: 1105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 1115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]), 1125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u8_u32(in.val[1])); 1135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]), 1145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u8_u32(in.val[3])); 1155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]), 1165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u16_u8(row23.val[0])); 1175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]), 1185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u16_u8(row23.val[1])); 1195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p1 = vreinterpretq_u8_u16(row02.val[0]); 1205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p0 = vreinterpretq_u8_u16(row13.val[0]); 1215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q0 = vreinterpretq_u8_u16(row02.val[1]); 1225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q1 = vreinterpretq_u8_u16(row13.val[1]); 1235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 1245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 1255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef LOADQ_LANE_32b 1265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 1275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif // !WORK_AROUND_GCC 1285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard 
Coles) 1295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride, 1305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p3, uint8x16_t* const p2, 1315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p1, uint8x16_t* const p0, 1325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q0, uint8x16_t* const q1, 1335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q2, uint8x16_t* const q3) { 1345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load4x16(src - 2, stride, p3, p2, p1, p0); 1355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load4x16(src + 2, stride, q0, q1, q2, q3); 1365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 1375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 1385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride, 1395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p1, uint8x16_t* const p0, 1405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q0, uint8x16_t* const q1) { 1415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p1 = vld1q_u8(src - 2 * stride); 1425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p0 = vld1q_u8(src - 1 * stride); 1435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q0 = vld1q_u8(src + 0 * stride); 1445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q1 = vld1q_u8(src + 1 * stride); 1455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 1465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 1475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride, 1485f1c94371a64b3196d4be9466099bb892df9b88eTorne 
(Richard Coles) uint8x16_t* const p3, uint8x16_t* const p2, 1495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p1, uint8x16_t* const p0, 1505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q0, uint8x16_t* const q1, 1515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q2, uint8x16_t* const q3) { 1525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load16x4(src - 2 * stride, stride, p3, p2, p1, p0); 1535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load16x4(src + 2 * stride, stride, q0, q1, q2, q3); 1545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 1555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 1565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load8x8x2(const uint8_t* const u, 1575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8_t* const v, 1585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int stride, 1595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p3, uint8x16_t* const p2, 1605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p1, uint8x16_t* const p0, 1615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q0, uint8x16_t* const q1, 1625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q2, uint8x16_t* const q3) { 1635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination 1645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // and the v-samples on the higher half. 
1655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride)); 1665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride)); 1675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride)); 1685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride)); 1695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride)); 1705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride)); 1715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride)); 1725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride)); 1735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 1745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 1755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC) 1765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 1775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define LOAD_UV_8(ROW) \ 1785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride)) 1795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 1805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Load8x8x2T(const uint8_t* const u, 1815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8_t* const v, 1825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int stride, 1835f1c94371a64b3196d4be9466099bb892df9b88eTorne 
(Richard Coles) uint8x16_t* const p3, uint8x16_t* const p2, 1845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const p1, uint8x16_t* const p0, 1855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q0, uint8x16_t* const q1, 1865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const q2, uint8x16_t* const q3) { 1875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination 1885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // and the v-samples on the higher half. 1895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t row0 = LOAD_UV_8(0); 1905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t row1 = LOAD_UV_8(1); 1915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t row2 = LOAD_UV_8(2); 1925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t row3 = LOAD_UV_8(3); 1935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t row4 = LOAD_UV_8(4); 1945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t row5 = LOAD_UV_8(5); 1955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t row6 = LOAD_UV_8(6); 1965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t row7 = LOAD_UV_8(7); 1975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Perform two side-by-side 8x8 transposes 1985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07 1995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ... 2005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ... 
2015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u30 u31 u32 u33 u34 u35 u36 u37 | ... 2025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u40 u41 u42 u43 u44 u45 u46 u47 | ... 2035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u50 u51 u52 u53 u54 u55 u56 u57 | ... 2045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ... 2055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ... 2065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ... 2075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u01 u11 u03 u13 ... 2085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ... 2095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // u21 u31 u23 u33 ... 2105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ... 2115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ... 
2125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]), 2135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u16_u8(row23.val[0])); 2145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]), 2155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u16_u8(row23.val[1])); 2165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]), 2175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u16_u8(row67.val[0])); 2185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]), 2195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u16_u8(row67.val[1])); 2205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]), 2215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u32_u16(row46.val[0])); 2225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]), 2235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u32_u16(row46.val[1])); 2245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]), 2255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u32_u16(row57.val[0])); 2265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]), 2275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vreinterpretq_u32_u16(row57.val[1])); 2285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p3 = 
vreinterpretq_u8_u32(row04.val[0]); 2295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p2 = vreinterpretq_u8_u32(row15.val[0]); 2305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p1 = vreinterpretq_u8_u32(row26.val[0]); 2315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *p0 = vreinterpretq_u8_u32(row37.val[0]); 2325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q0 = vreinterpretq_u8_u32(row04.val[1]); 2335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q1 = vreinterpretq_u8_u32(row15.val[1]); 2345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q2 = vreinterpretq_u8_u32(row26.val[1]); 2355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *q3 = vreinterpretq_u8_u32(row37.val[1]); 2365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 2375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef LOAD_UV_8 2385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 2395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif // !WORK_AROUND_GCC 2405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 2415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store2x8(const uint8x8x2_t v, 2425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const dst, int stride) { 2435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst2_lane_u8(dst + 0 * stride, v, 0); 2445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst2_lane_u8(dst + 1 * stride, v, 1); 2455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst2_lane_u8(dst + 2 * stride, v, 2); 2465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst2_lane_u8(dst + 3 * stride, v, 3); 2475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst2_lane_u8(dst + 4 * stride, v, 4); 2485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst2_lane_u8(dst + 5 * stride, v, 5); 
2495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst2_lane_u8(dst + 6 * stride, v, 6); 2505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst2_lane_u8(dst + 7 * stride, v, 7); 2515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 2525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 2535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0, 2545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const dst, int stride) { 2555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x2_t lo, hi; 2565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) lo.val[0] = vget_low_u8(p0); 2575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) lo.val[1] = vget_low_u8(q0); 2585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) hi.val[0] = vget_high_u8(p0); 2595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) hi.val[1] = vget_high_u8(q0); 2605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store2x8(lo, dst - 1 + 0 * stride, stride); 2615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store2x8(hi, dst - 1 + 8 * stride, stride); 2625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 2635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 2645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC) 2655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store4x8(const uint8x8x4_t v, 2665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const dst, int stride) { 2675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(dst + 0 * stride, v, 0); 2685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(dst + 1 * stride, v, 1); 2695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(dst + 2 * 
stride, v, 2); 2705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(dst + 3 * stride, v, 3); 2715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(dst + 4 * stride, v, 4); 2725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(dst + 5 * stride, v, 5); 2735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(dst + 6 * stride, v, 6); 2745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(dst + 7 * stride, v, 7); 2755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 2765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 2775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0, 2785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 2795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const dst, int stride) { 2805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x4_t lo, hi; 2815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR4(lo, 2825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_low_u8(p1), vget_low_u8(p0), 2835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_low_u8(q0), vget_low_u8(q1)); 2845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR4(hi, 2855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_high_u8(p1), vget_high_u8(p0), 2865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_high_u8(q0), vget_high_u8(q1)); 2875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store4x8(lo, dst - 2 + 0 * stride, stride); 2885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store4x8(hi, dst - 2 + 8 * stride, stride); 2895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 2905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard 
Coles)#endif // !WORK_AROUND_GCC 2915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 2925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0, 2935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const dst, int stride) { 2945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1q_u8(dst - stride, p0); 2955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1q_u8(dst, q0); 2965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 2975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 2985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0, 2995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 3005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const dst, int stride) { 3015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store16x2(p1, p0, dst - stride, stride); 3025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store16x2(q0, q1, dst + stride, stride); 3035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 3045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 3055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0, 3065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const u, uint8_t* const v, 3075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int stride) { 3085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // p0 and q0 contain the u+v samples packed in low/high halves. 
3095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1_u8(u - stride, vget_low_u8(p0)); 3105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1_u8(u, vget_low_u8(q0)); 3115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1_u8(v - stride, vget_high_u8(p0)); 3125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1_u8(v, vget_high_u8(q0)); 3135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 3145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 3155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0, 3165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 3175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const u, uint8_t* const v, 3185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int stride) { 3195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // The p1...q1 registers contain the u+v samples packed in low/high halves. 
3205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store8x2x2(p1, p0, u - stride, v - stride, stride); 3215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store8x2x2(q0, q1, u + stride, v + stride, stride); 3225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 3235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 3245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC) 3255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 3265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \ 3275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \ 3285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \ 3295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) (DST) += stride; \ 3305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} while (0) 3315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 3325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1, 3335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t p0, const uint8x16_t q0, 3345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q1, const uint8x16_t q2, 3355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* u, uint8_t* v, 3365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int stride) { 3375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x3_t u0, u1, v0, v1; 3385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0)); 3395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2)); 
3405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0)); 3415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2)); 3425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(u, u0, u1, 0); 3435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(u, u0, u1, 1); 3445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(u, u0, u1, 2); 3455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(u, u0, u1, 3); 3465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(u, u0, u1, 4); 3475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(u, u0, u1, 5); 3485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(u, u0, u1, 6); 3495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(u, u0, u1, 7); 3505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(v, v0, v1, 0); 3515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(v, v0, v1, 1); 3525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(v, v0, v1, 2); 3535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(v, v0, v1, 3); 3545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(v, v0, v1, 4); 3555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(v, v0, v1, 5); 3565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(v, v0, v1, 6); 3575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE6_LANE(v, v0, v1, 7); 3585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 3595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef STORE6_LANE 3605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 
3615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, 3625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 3635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const u, uint8_t* const v, 3645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int stride) { 3655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x4_t u0, v0; 3665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR4(u0, 3675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_low_u8(p1), vget_low_u8(p0), 3685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_low_u8(q0), vget_low_u8(q1)); 3695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR4(v0, 3705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_high_u8(p1), vget_high_u8(p0), 3715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_high_u8(q0), vget_high_u8(q1)); 3725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(u - 2 + 0 * stride, u0, 0); 3735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(u - 2 + 1 * stride, u0, 1); 3745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(u - 2 + 2 * stride, u0, 2); 3755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(u - 2 + 3 * stride, u0, 3); 3765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(u - 2 + 4 * stride, u0, 4); 3775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(u - 2 + 5 * stride, u0, 5); 3785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(u - 2 + 6 * stride, u0, 6); 3795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(u - 2 + 7 * stride, u0, 7); 3805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 
vst4_lane_u8(v - 2 + 0 * stride, v0, 0); 3815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(v - 2 + 1 * stride, v0, 1); 3825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(v - 2 + 2 * stride, v0, 2); 3835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(v - 2 + 3 * stride, v0, 3); 3845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(v - 2 + 4 * stride, v0, 4); 3855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(v - 2 + 5 * stride, v0, 5); 3865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(v - 2 + 6 * stride, v0, 6); 3875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst4_lane_u8(v - 2 + 7 * stride, v0, 7); 3885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 3895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 3905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif // !WORK_AROUND_GCC 3915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 3925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t. 3935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { 3945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); 3955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 3965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 3975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result 3985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// to the corresponding rows of 'dst'. 
3995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, 4005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t dst01, 4015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t dst23) { 4025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Unsigned saturate to 8b. 4035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x8_t dst01_u8 = vqmovun_s16(dst01); 4045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x8_t dst23_u8 = vqmovun_s16(dst23); 4055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Store the results. 4075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0); 4085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1); 4095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0); 4105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); 4115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 4125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, 4145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8_t* const dst) { 4155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint32x2_t dst01 = vdup_n_u32(0); 4165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint32x2_t dst23 = vdup_n_u32(0); 4175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 
4185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Load the source pixels. 4195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0); 4205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0); 4215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1); 4225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1); 4235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 4255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Convert to 16b. 4265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); 4275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); 4285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Descale with rounding. 4305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); 4315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); 4325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Add the inverse transform. 
4335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) SaturateAndStore4x4(dst, out01, out23); 4345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 4355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 4365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//----------------------------------------------------------------------------- 4385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Simple In-loop filtering (Paragraph 15.2) 4395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0, 4415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 4425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh) { 4435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh); 4445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0) 4455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1) 4465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0) 4475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2 4485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2); 4495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = vcgeq_u8(thresh_v, sum); 4505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return mask; 4515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 
4525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static int8x16_t FlipSign(const uint8x16_t v) { 4545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t sign_bit = vdupq_n_u8(0x80); 4555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return vreinterpretq_s8_u8(veorq_u8(v, sign_bit)); 4565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 4575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static uint8x16_t FlipSignBack(const int8x16_t v) { 4595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t sign_bit = vdupq_n_s8(0x80); 4605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return vreinterpretq_u8_s8(veorq_s8(v, sign_bit)); 4615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 4625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, 4645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q0, const int8x16_t q1) { 4655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) 4665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1) 4675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0) 4685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0) 4695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0) 4705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return s3; 
4715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 4725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { 4745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) 4755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0) 4765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0) 4775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return s2; 4785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 4795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------ 4815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, 4835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta, 4845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const op0, uint8x16_t* const oq0) { 4855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t kCst3 = vdupq_n_s8(0x03); 4865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t kCst4 = vdupq_n_s8(0x04); 4875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); 4885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); 4895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); 
4905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); 4915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t sp0 = vqaddq_s8(p0s, delta3); 4925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t sq0 = vqsubq_s8(q0s, delta4); 4935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *op0 = FlipSignBack(sp0); 4945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *oq0 = FlipSignBack(sq0); 4955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 4965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if defined(USE_INTRINSICS) 4985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 4995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, 5005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 5015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask, 5025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const op0, uint8x16_t* const oq0) { 5035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t p1s = FlipSign(p1); 5045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t p0s = FlipSign(p0); 5055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q0s = FlipSign(q0); 5065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q1s = FlipSign(q1); 5075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); 5085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); 5095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ApplyFilter2(p0s, q0s, 
delta1, op0, oq0); 5105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 5115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 5125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { 5135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p1, p0, q0, q1, op0, oq0; 5145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load16x4(p, stride, &p1, &p0, &q0, &q1); 5155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 5165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); 5175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); 5185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 5195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store16x2(op0, oq0, p, stride); 5205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 5215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 5225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { 5235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p1, p0, q0, q1, oq0, op0; 5245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load4x16(p, stride, &p1, &p0, &q0, &q1); 5255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 5265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); 5275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); 5285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 5295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store2x16(op0, oq0, p, stride); 5305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 
5315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 5325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#else 5335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 5345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define QRegs "q0", "q1", "q2", "q3", \ 5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define FLIP_SIGN_BIT2(a, b, s) \ 5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "veor " #a "," #a "," #s " \n" \ 5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "veor " #b "," #b "," #s " \n" \ 5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define FLIP_SIGN_BIT4(a, b, c, d, s) \ 5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLIP_SIGN_BIT2(a, b, s) \ 5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLIP_SIGN_BIT2(c, d, s) \ 5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \ 5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \ 5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \ 5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \ 5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \ 5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ 5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vdup.8 
q14, " #thresh " \n" \ 5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */ 5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define GET_BASE_DELTA(p1, p0, q0, q1, o) \ 5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \ 5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \ 5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \ 5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \ 5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */ 5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define DO_SIMPLE_FILTER(p0, q0, fl) \ 5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vmov.i8 q15, #0x03 \n" \ 5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \ 5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \ 5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \ 5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) \ 5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vmov.i8 q15, #0x04 \n" \ 5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \ 5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \ 
5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */ 5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Applies filter on 2 pixels (p0 and q0) 5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define DO_FILTER2(p1, p0, q0, q1, thresh) \ 5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \ 5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vmov.i8 q10, #0x80 \n" /* sign bit */ \ 5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \ 5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \ 5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vand q9, q9, q11 \n" /* apply filter mask */ \ 5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \ 5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLIP_SIGN_BIT2(p0, q0, q10) 5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { 5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __asm__ volatile ( 5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride 5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1 5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0 5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.u8 {q3}, [%[p]], %[stride] 
\n" // q0 5895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "vld1.u8 {q12}, [%[p]] \n" // q1 5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DO_FILTER2(q1, q2, q3, q12, %[thresh]) 5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride 5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0 5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.u8 {q3}, [%[p]] \n" // store oq0 5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [p] "+r"(p) 5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [stride] "r"(stride), [thresh] "r"(thresh) 5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : "memory", QRegs 6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ); 6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { 6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __asm__ volatile ( 6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "sub r4, %[p], #2 \n" // base1 = p - 2 6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "lsl r6, %[stride], #1 \n" // r6 = 2 * stride 6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "add r5, r4, %[stride] \n" // base2 = base1 + stride 6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6) 6105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard 
Coles) LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6) 6115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "vswp d3, d24 \n" // p1:q1 p0:q3 6125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "vswp d5, d26 \n" // q0:q2 q1:q4 6135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4 6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) DO_FILTER2(q1, q2, q12, q13, %[thresh]) 6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "sub %[p], %[p], #1 \n" // p - 1 6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6195d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) "vswp d5, d24 \n" 6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) STORE8x2(d4, d5, [%[p]], %[stride]) 6215d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) STORE8x2(d24, d25, [%[p]], %[stride]) 6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [p] "+r"(p) 6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [stride] "r"(stride), [thresh] "r"(thresh) 6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : "memory", "r4", "r5", "r6", QRegs 6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ); 6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif // USE_INTRINSICS 6305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 6315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { 6325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint32_t k; 
6335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) for (k = 3; k != 0; --k) { 6345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) p += 4 * stride; 6355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) SimpleVFilter16(p, stride, thresh); 6365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 6375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 6385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 6395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { 6405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint32_t k; 6415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) for (k = 3; k != 0; --k) { 6425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) p += 4; 6435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) SimpleHFilter16(p, stride, thresh); 6445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 6455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 6465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 6475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------ 6485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Complex In-loop filtering (Paragraph 15.3) 6495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 6505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, 6515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 6525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int hev_thresh) { 6535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); 6545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t 
a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) 6555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) 6565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v); 6575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v); 6585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = vorrq_u8(mask1, mask2); 6595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return mask; 6605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 6615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 6625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, 6635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t p1, const uint8x16_t p0, 6645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 6655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q2, const uint8x16_t q3, 6665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int ithresh, int thresh) { 6675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); 6685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) 6695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1) 6705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) 6715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2) 6725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t 
a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1) 6735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) 6745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1); 6755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2); 6765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0); 6775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t max12 = vmaxq_u8(max1, max2); 6785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t max123 = vmaxq_u8(max12, max3); 6795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123); 6805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh); 6815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = vandq_u8(mask1, mask2); 6825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) return mask; 6835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 6845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 6855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// 4-points filter 6865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 6875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void ApplyFilter4( 6885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t p1, const int8x16_t p0, 6895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q0, const int8x16_t q1, 6905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta0, 6915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const op1, uint8x16_t* const op0, 
6925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const oq0, uint8x16_t* const oq1) { 6935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t kCst3 = vdupq_n_s8(0x03); 6945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t kCst4 = vdupq_n_s8(0x04); 6955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta1 = vqaddq_s8(delta0, kCst4); 6965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta2 = vqaddq_s8(delta0, kCst3); 6975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t a1 = vshrq_n_s8(delta1, 3); 6985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t a2 = vshrq_n_s8(delta2, 3); 6995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1 7005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2) 7015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) 7025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3) 7035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3) 7045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 7055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 7065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void DoFilter4( 7075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t p1, const uint8x16_t p0, 7085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, 7095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask, const uint8x16_t hev_mask, 7105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard 
Coles) uint8x16_t* const op1, uint8x16_t* const op0, 7115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const oq0, uint8x16_t* const oq1) { 7125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // This is a fused version of DoFilter2() calling ApplyFilter2 directly 7135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t p1s = FlipSign(p1); 7145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int8x16_t p0s = FlipSign(p0); 7155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int8x16_t q0s = FlipSign(q0); 7165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q1s = FlipSign(q1); 7175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); 7185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 7195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // do_filter2 part (simple loopfilter on pixels with hev) 7205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 7215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); 7225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t simple_lf_delta = 7235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); 7245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t tmp_p0, tmp_q0; 7255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); 7265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here 7275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) p0s = FlipSign(tmp_p0); 7285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) q0s = FlipSign(tmp_q0); 7295f1c94371a64b3196d4be9466099bb892df9b88eTorne 
(Richard Coles) } 7305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 7315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // do_filter4 part (complex loopfilter on pixels without hev) 7325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 7335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta0 = GetBaseDelta0(p0s, q0s); 7345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // we use: (mask & hev_mask) ^ mask = mask & !hev_mask 7355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); 7365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t complex_lf_delta = 7375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); 7385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); 7395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 7405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 7415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 7425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// 6-points filter 7435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 7445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void ApplyFilter6( 7455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, 7465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, 7475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta, 7485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, 7495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const oq0, 
uint8x16_t* const oq1, uint8x16_t* const oq2) { 7505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t kCst63 = vdupq_n_s16(63); 7515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t kCst27 = vdup_n_s8(27); 7525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t kCst18 = vdup_n_s8(18); 7535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t kCst9 = vdup_n_s8(9); 7545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t delta_lo = vget_low_s8(delta); 7555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t delta_hi = vget_high_s8(delta); 7565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a 7575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a 7585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a 7595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a 7605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a 7615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a 7625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7); 7635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7); 7645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7); 7655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7); 
7665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7); 7675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7); 7685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); 7695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); 7705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); 7715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 7725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) 7735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) 7745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) 7755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2) 7765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3) 7775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3) 7785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 7795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 7805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void DoFilter6( 7815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, 7825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, 7835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask, const uint8x16_t hev_mask, 7845f1c94371a64b3196d4be9466099bb892df9b88eTorne
(Richard Coles) uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, 7855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { 7865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // This is a fused version of DoFilter2() calling ApplyFilter2 directly 7875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t p2s = FlipSign(p2); 7885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t p1s = FlipSign(p1); 7895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int8x16_t p0s = FlipSign(p0); 7905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int8x16_t q0s = FlipSign(q0); 7915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q1s = FlipSign(q1); 7925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t q2s = FlipSign(q2); 7935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); 7945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); 7955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 7965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // do_filter2 part (simple loopfilter on pixels with hev) 7975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 7985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t simple_lf_delta = 7995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); 8005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t tmp_p0, tmp_q0; 8015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); 8025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // TODO(skal): avoid the double FlipSign() 
in ApplyFilter2() and here 8035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) p0s = FlipSign(tmp_p0); 8045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) q0s = FlipSign(tmp_q0); 8055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 8065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 8075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // do_filter6 part (complex loopfilter on pixels without hev) 8085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 8095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // we use: (mask & hev_mask) ^ mask = mask & !hev_mask 8105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); 8115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int8x16_t complex_lf_delta = 8125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); 8135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, 8145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) op2, op1, op0, oq0, oq1, oq2); 8155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 8165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 8175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 8185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// on macroblock edges 8195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 8205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void VFilter16(uint8_t* p, int stride, 8215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh, int ithresh, int hev_thresh) { 8225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 8235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load16x8(p, stride, &p3, &p2, 
&p1, &p0, &q0, &q1, &q2, &q3); 8245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 8255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, 8265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ithresh, thresh); 8275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); 8285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t op2, op1, op0, oq0, oq1, oq2; 8295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, 8305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) &op2, &op1, &op0, &oq0, &oq1, &oq2); 8315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store16x2(op2, op1, p - 2 * stride, stride); 8325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store16x2(op0, oq0, p + 0 * stride, stride); 8335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store16x2(oq1, oq2, p + 2 * stride, stride); 8345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 8355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 8365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 8375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void HFilter16(uint8_t* p, int stride, 8385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh, int ithresh, int hev_thresh) { 8395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 8405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 8415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 8425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, 8435f1c94371a64b3196d4be9466099bb892df9b88eTorne 
(Richard Coles) ithresh, thresh); 8445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); 8455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t op2, op1, op0, oq0, oq1, oq2; 8465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, 8475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) &op2, &op1, &op0, &oq0, &oq1, &oq2); 8485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store2x16(op2, op1, p - 2, stride); 8495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store2x16(op0, oq0, p + 0, stride); 8505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store2x16(oq1, oq2, p + 2, stride); 8515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 8525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 8535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 8545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// on three inner edges 8555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void VFilter16i(uint8_t* p, int stride, 8565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh, int ithresh, int hev_thresh) { 8575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint32_t k; 8585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p3, p2, p1, p0; 8595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0); 8605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) for (k = 3; k != 0; --k) { 8615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t q0, q1, q2, q3; 8625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) p += 4 * stride; 8635f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3); 
8645f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 8655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = 8665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); 8675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); 8685f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // p3 and p2 are not just temporary variables here: they will be 8695f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // re-used for next span. And q2/q3 will become p1/p0 accordingly. 8705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); 8715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store16x4(p1, p0, p3, p2, p, stride); 8725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) p1 = q2; 8735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) p0 = q3; 8745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 8755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 8775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC) 8795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void HFilter16i(uint8_t* p, int stride, 8805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh, int ithresh, int hev_thresh) { 8815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint32_t k; 8825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p3, p2, p1, p0; 8835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load4x16(p + 2, stride, &p3, &p2, &p1, &p0); 8845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) for (k = 3; k != 0; 
--k) { 8855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t q0, q1, q2, q3; 8865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) p += 4; 8875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load4x16(p + 2, stride, &q0, &q1, &q2, &q3); 8885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 8895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = 8905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); 8915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); 8925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); 8935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store4x16(p1, p0, p3, p2, p, stride); 8945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) p1 = q2; 8955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) p0 = q3; 8965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 8975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 8985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 8995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif // !WORK_AROUND_GCC 9005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 9015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// 8-pixels wide variant, for chroma filtering 9025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void VFilter8(uint8_t* u, uint8_t* v, int stride, 9035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh, int ithresh, int hev_thresh) { 9045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 9055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, 
&q2, &q3); 9065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 9075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, 9085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ithresh, thresh); 9095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); 9105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t op2, op1, op0, oq0, oq1, oq2; 9115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, 9125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) &op2, &op1, &op0, &oq0, &oq1, &oq2); 9135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride); 9145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride); 9155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); 9165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 9175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 9185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void VFilter8i(uint8_t* u, uint8_t* v, int stride, 9195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh, int ithresh, int hev_thresh) { 9205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 9215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) u += 4 * stride; 9225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) v += 4 * stride; 9235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 9245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 
9255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, 9265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ithresh, thresh); 9275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); 9285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t op1, op0, oq0, oq1; 9295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); 9305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store8x4x2(op1, op0, oq0, oq1, u, v, stride); 9315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 9325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 9335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 9345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if !defined(WORK_AROUND_GCC) 9355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void HFilter8(uint8_t* u, uint8_t* v, int stride, 9365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh, int ithresh, int hev_thresh) { 9375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 9385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 9395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 9405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, 9415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ithresh, thresh); 9425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); 9435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t op2, op1, op0, oq0, oq1, oq2; 
9445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, 9455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) &op2, &op1, &op0, &oq0, &oq1, &oq2); 9465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride); 9475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 9485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 9495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 9505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void HFilter8i(uint8_t* u, uint8_t* v, int stride, 9515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int thresh, int ithresh, int hev_thresh) { 9525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; 9535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) u += 4; 9545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) v += 4; 9555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 9565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 9575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, 9585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ithresh, thresh); 9595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); 9605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x16_t op1, op0, oq0, oq1; 9615f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); 9625f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Store4x8x2(op1, op0, oq0, oq1, u, v, stride); 9635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 
9645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 9655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif // !WORK_AROUND_GCC 9665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9672a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)//----------------------------------------------------------------------------- 9682a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// Inverse transforms (Paragraph 14.4) 9692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 9705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Technically these are unsigned but vqdmulh is only available in signed. 9715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// vqdmulh returns high half (effectively >> 16) but also doubles the value, 9725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// changing the >> 16 to >> 15 and requiring an additional >> 1. 9735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// We use this to our advantage with kC2. The canonical value is 35468. 9745f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// However, the high bit is set so treating it as signed will give incorrect 9755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// results. We avoid this by down shifting by 1 here to clear the highest bit. 9765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Combined with the doubling effect of vqdmulh we get >> 16. 9775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// This can not be applied to kC1 because the lowest bit is set. Down shifting 9785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// the constant would reduce precision. 9795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 9805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// libwebp uses a trick to avoid some extra addition that libvpx does. 
9815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// Instead of: 9825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 9835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the 9845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)// same issue with kC1 and vqdmulh that we work around by down shifting kC2 9855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 9865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static const int16_t kC1 = 20091; 9875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. 9885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 9895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#if defined(USE_INTRINSICS) 9905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, 9915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int16x8x2_t* const out) { 9925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 9935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 9945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... 9955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // b0 d0 b1 d1 b2 d2 ... 
9965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); 9975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 9985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 9995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { 10005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // {rows} = in0 | in4 10015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // in8 | in12 10025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // B1 = in4 | in12 10035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t B1 = 10045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); 10055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // C0 = kC1 * in4 | kC1 * in12 10065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // C1 = kC2 * in4 | kC2 * in12 10075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); 10085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); 10095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), 10105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_low_s16(rows->val[1])); // in0 + in8 10115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), 10125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) vget_low_s16(rows->val[1])); // in0 - in8 10135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // c = kC2 * in4 - kC1 * in12 10145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // d = kC1 * in4 + kC2 * in12 10155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard 
Coles) const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); 10165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); 10175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b 10185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c 10195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c 10205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c 10215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); 10225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Transpose8x2(E0, E1, rows); 10235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 10245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 10255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)static void TransformOne(const int16_t* in, uint8_t* dst) { 10265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int16x8x2_t rows; 10275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); 10285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) TransformPass(&rows); 10295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) TransformPass(&rows); 10305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Add4x4(rows.val[0], rows.val[1], dst); 10315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 10325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 10335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#else 10345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10355f1c94371a64b3196d4be9466099bb892df9b88eTorne 
(Richard Coles)static void TransformOne(const int16_t* in, uint8_t* dst) { 10365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int kBPS = BPS; 10375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // kC1, kC2. Padded because vld1.16 loads 8 bytes 10385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16_t constants[4] = { kC1, kC2, 0, 0 }; 10395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */ 10405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __asm__ volatile ( 10415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.16 {q1, q2}, [%[in]] \n" 10425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.16 {d0}, [%[constants]] \n" 10435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d2: in[0] 10455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d3: in[8] 10465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d4: in[4] 10475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d5: in[12] 10485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 10495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vswp d3, d4 \n" 10505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* q8 = {in[4], in[12]} * kC1 * 2 >> 16 10525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * q9 = {in[4], in[12]} * kC2 >> 16 10535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 10545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q8, q2, d0[0] \n" 10555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q9, q2, d0[1] \n" 10565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10575821806d5e7f356e8fa4b058a389a808ea183019Torne 
(Richard Coles) /* d22 = a = in[0] + in[8] 10585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d23 = b = in[0] - in[8] 10595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 10605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d22, d2, d3 \n" 10615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d23, d2, d3 \n" 10625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* The multiplication should be x * kC1 >> 16 10645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * However, with vqdmulh we get x * kC1 * 2 >> 16 10655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * (multiply, double, return high half) 10665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * We avoided this in kC2 by pre-shifting the constant. 10675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * q8 = in[4]/[12] * kC1 >> 16 10685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 10695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vshr.s16 q8, q8, #1 \n" 10705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* Add {in[4], in[12]} back after the multiplication. This is handled by 10725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * adding 1 << 16 to kC1 in the libwebp C code. 
10735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 10745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q8, q2, q8 \n" 10755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d20 = c = in[4]*kC2 - in[12]*kC1 10775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d21 = d = in[4]*kC1 + in[12]*kC2 10785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 10795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d20, d18, d17 \n" 10805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d21, d19, d16 \n" 10815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d2 = tmp[0] = a + d 10835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d3 = tmp[1] = b + c 10845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d4 = tmp[2] = b - c 10855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d5 = tmp[3] = a - d 10865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 10875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d2, d22, d21 \n" 10885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d3, d23, d20 \n" 10895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d4, d23, d20 \n" 10905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d5, d22, d21 \n" 10915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vzip.16 q1, q2 \n" 10935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vzip.16 q1, q2 \n" 10945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vswp d3, d4 \n" 10965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles) 10975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 10985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * q9 = {tmp[4], tmp[12]} * kC2 >> 16 10995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 11005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q8, q2, d0[0] \n" 11015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q9, q2, d0[1] \n" 11025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d22 = a = tmp[0] + tmp[8] 11045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d23 = b = tmp[0] - tmp[8] 11055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 11065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d22, d2, d3 \n" 11075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d23, d2, d3 \n" 11085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* See long winded explanations prior */ 11105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vshr.s16 q8, q8, #1 \n" 11115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q8, q2, q8 \n" 11125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d20 = c = in[4]*kC2 - in[12]*kC1 11145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d21 = d = in[4]*kC1 + in[12]*kC2 11155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 11165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d20, d18, d17 \n" 11175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d21, d19, d16 \n" 11185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles) /* d2 = tmp[0] = a + d 11205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d3 = tmp[1] = b + c 11215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d4 = tmp[2] = b - c 11225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d5 = tmp[3] = a - d 11235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 11245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d2, d22, d21 \n" 11255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d3, d23, d20 \n" 11265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d4, d23, d20 \n" 11275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d5, d22, d21 \n" 11285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d6[0], [%[dst]], %[kBPS] \n" 11305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d6[1], [%[dst]], %[kBPS] \n" 11315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d7[0], [%[dst]], %[kBPS] \n" 11325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d7[1], [%[dst]], %[kBPS] \n" 11335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "sub %[dst], %[dst], %[kBPS], lsl #2 \n" 11355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* (val) + 4 >> 3 */ 11375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d2, d2, #3 \n" 11385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d3, d3, #3 \n" 11395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d4, d4, #3 \n" 11405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d5, d5, #3 \n" 11415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
11425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vzip.16 q1, q2 \n" 11435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vzip.16 q1, q2 \n" 11445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* Must accumulate before saturating */ 11465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vmovl.u8 q8, d6 \n" 11475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vmovl.u8 q9, d7 \n" 11485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q1, q1, q8 \n" 11505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q2, q2, q9 \n" 11515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqmovun.s16 d0, q1 \n" 11535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqmovun.s16 d1, q2 \n" 11545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 11565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 11575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 11585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d1[1], [%[dst]] \n" 11595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 11615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ 11625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ 11635821806d5e7f356e8fa4b058a389a808ea183019Torne 
(Richard Coles) ); 11645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#endif // USE_INTRINSICS 11675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 11685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { 11695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) TransformOne(in, dst); 11705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (do_two) { 11715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) TransformOne(in + 16, dst + 4); 11725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)static void TransformDC(const int16_t* in, uint8_t* dst) { 11765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t DC = vdupq_n_s16(in[0]); 11775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Add4x4(DC, DC, dst); 11785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} 11795d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 11805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------ 11815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 11825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define STORE_WHT(dst, col, rows) do { \ 11835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \ 11845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \ 11855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *dst = 
vgetq_lane_s32(rows.val[2], col); (dst) += 16; \ 11865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ 11875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)} while (0) 11885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 11895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void TransformWHT(const int16_t* in, int16_t* out) { 11905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) int32x4x4_t tmp; 11915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 11925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 11935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Load the source. 11945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t in00_03 = vld1_s16(in + 0); 11955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t in04_07 = vld1_s16(in + 4); 11965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t in08_11 = vld1_s16(in + 8); 11975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t in12_15 = vld1_s16(in + 12); 11985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15] 11995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11] 12005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11] 12015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15] 12025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[0] = vaddq_s32(a0, a1); 12035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[1] = vaddq_s32(a3, a2); 12045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard 
Coles) tmp.val[2] = vsubq_s32(a0, a1); 12055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[3] = vsubq_s32(a3, a2); 12065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // Arrange the temporary results column-wise. 12075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp = Transpose4x4(tmp); 12085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 12095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 12105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) { 12115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t kCst3 = vdupq_n_s32(3); 12125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder 12135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]); 12145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]); 12155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]); 12165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]); 12175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 12185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[0] = vaddq_s32(a0, a1); 12195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[1] = vaddq_s32(a3, a2); 12205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[2] = vsubq_s32(a0, a1); 12215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[3] = vsubq_s32(a3, a2); 12225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 12235f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) // right shift the results by 3. 
12245f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[0] = vshrq_n_s32(tmp.val[0], 3); 12255f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[1] = vshrq_n_s32(tmp.val[1], 3); 12265f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[2] = vshrq_n_s32(tmp.val[2], 3); 12275f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) tmp.val[3] = vshrq_n_s32(tmp.val[3], 3); 12285f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 12295f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE_WHT(out, 0, tmp); 12305f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE_WHT(out, 1, tmp); 12315f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE_WHT(out, 2, tmp); 12325f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) STORE_WHT(out, 3, tmp); 12335f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) } 12345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} 12355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 12365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef STORE_WHT 12375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 12385f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)//------------------------------------------------------------------------------ 12395f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 12405f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define MUL(a, b) (((a) * (b)) >> 16) 12415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)static void TransformAC3(const int16_t* in, uint8_t* dst) { 12425f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) static const int kC1_full = 20091 + (1 << 16); 12435f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) static const int kC2_full = 35468; 12445f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t A = vdup_n_s16(in[0]); 
12455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); 12465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); 12475f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int c1 = MUL(in[1], kC2_full); 12485f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int d1 = MUL(in[1], kC1_full); 12495f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 | 12505f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) (uint64_t)( c1 & 0xffff) << 16 | 12515f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) (uint64_t)(-c1 & 0xffff) << 32 | 12525f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) (uint64_t)(-d1 & 0xffff) << 48; 12535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t CD = vcreate_s16(cd); 12545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x4_t B = vqadd_s16(A, CD); 12555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); 12565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); 12575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) Add4x4(m0_m1, m2_m3, dst); 12582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 12595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#undef MUL 12602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 12612a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif // WEBP_USE_NEON 12622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 12632a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)//------------------------------------------------------------------------------ 
// Entry point

extern void VP8DspInitNEON(void);

// Points the VP8 transform and loop-filter hooks at the NEON implementations
// from this file. Does nothing when WEBP_USE_NEON is not defined.
void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  // Inverse transforms.
  VP8Transform = TransformTwo;
  VP8TransformDC = TransformDC;
  VP8TransformAC3 = TransformAC3;
  VP8TransformWHT = TransformWHT;

  // Simple loop filters.
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  // Loop filters installed unconditionally.
  VP8VFilter16 = VFilter16;
  VP8VFilter16i = VFilter16i;
  VP8VFilter8 = VFilter8;
  VP8VFilter8i = VFilter8i;
  VP8HFilter16 = HFilter16;

  // These hooks are left untouched when WORK_AROUND_GCC is defined.
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i;
  VP8HFilter8 = HFilter8;
  VP8HFilter8i = HFilter8i;
#endif
#endif   // WEBP_USE_NEON
}